setwd("~/GitHub/Forecasting-Tournament") #igor's working directory
dat <- read.csv("dat_for_analyses.csv", stringsAsFactors = FALSE)
dat_long <- read.csv("dat_long.csv", stringsAsFactors = FALSE)
#add simulation benchmarks
load("sim/BenchmarkData_Combined.RData")

sim.w1 <- Stats_all_benchmarks_raw
sim.w1$Wave <-"First Tournament (May 2020)"
sim.w1 <- subset(sim.w1,source!="Experts"&source!="Lay People")
sim.w1$response <- sim.w1$Mean
sim.w1$lower.CL <- sim.w1$CI_L
sim.w1$upper.CL <- sim.w1$CI_U
sim.w1$Type[sim.w1$source=="Benchmark 1"]<-"Historic Mean"
## Warning: Unknown or uninitialised column: `Type`.
sim.w1$Type[sim.w1$source=="Benchmark 2"]<-"Random Walk"
sim.w1$Type[sim.w1$source=="Benchmark 3"]<-"Linear Regression"

load("sim/BenchmarkData_Combined_W2.RData")
sim.w2<-Stats_all_benchmarks_raw_w2
sim.w2$Wave<-"Second Tournament (Nov 2020)"
sim.w2<-subset(sim.w2,source!="ExpertsW2")
sim.w2$response<-sim.w2$Mean
sim.w2$lower.CL<-sim.w2$CI_L
sim.w2$upper.CL<-sim.w2$CI_U
sim.w2$Type[sim.w1$source=="Benchmark 1"]<-"Historic Mean"
## Warning: Unknown or uninitialised column: `Type`.
sim.w2$Type[sim.w1$source=="Benchmark 2"]<-"Random Walk"
sim.w2$Type[sim.w1$source=="Benchmark 3"]<-"Linear Regression"
#get simulation-based random walk cut-offs - to be used for inspection of top teams

#ADD PART how to use RW SIM scores per domain per wave to get the cutoff scores.

##
#subset benchmark, first
sim.w1.rw <- sim.w1 %>% 
  filter(Type == 'Random Walk') %>% 
  mutate(rw.MASE.w1 = response) %>% 
  dplyr::select(domain, rw.MASE.w1)

sim.w2.rw <- sim.w2 %>% 
  filter(Type == 'Random Walk') %>% 
  mutate(rw.MASE.w2 = response) %>% 
  dplyr::select(domain, rw.MASE.w2)

##

## add to the datafile
dat <- dat %>% 
  left_join(sim.w1.rw)
## Joining with `by = join_by(domain)`
dat <- dat %>% 
  left_join(sim.w2.rw)
## Joining with `by = join_by(domain)`
## create cut-offs
dat$compare_to_naive_rwf_MASE<-NA #first set to NA
dat<-dat %>% #create difference score between MASE of estimate and RW MASE
mutate(diff_to_naive_rwf_MASE = case_when(
      phase==1 ~ MASE1_w1 - rw.MASE.w1 ,
      phase==2 ~ MASE1_w2 - rw.MASE.w2 
    ))

#check number of NAs
#View(dat[is.na(dat$diff_to_naive_rwf_MASE),c("ResponseId", "team_name", "domain", "compare_to_naive_rwf","phase", "Month.1" ,
#                                            "Month.2" ,"Month.3" ,"Month.4" ,"Month.5" ,"Month.6","Month.7",
#                                           "Month.8" ,"Month.9" ,"Month.10" ,"Month.11" ,"Month.12","MASE1_w2", "rw.MASE.w2")]) 

dat<-dat %>% #use the diff score values to calculate the cut offs (for graphs) later on 
mutate(compare_to_naive_rwf_MASE = case_when(
      diff_to_naive_rwf_MASE < 1 ~ "Below Random Walk",
      diff_to_naive_rwf_MASE == 1 ~ "Equal to Random Walk",
      diff_to_naive_rwf_MASE > 1 ~ "Above Random Walk"
    ))
#cross-check data for NAs
#View(dat[is.na(dat$compare_to_naive_rwf_MASE),c("ResponseId", "team_name", "domain", "compare_to_naive_rwf","phase","Month.1" ,
#                                            "Month.2" ,"Month.3" ,"Month.4" ,"Month.5" ,"Month.6","Month.7",
#                                            "Month.8" ,"Month.9" ,"Month.10" ,"Month.11" ,"Month.12" ,                                           "MASE1_w1", "rw.MASE.w1")]) 
# dataset that only includes academic predictions and those who provided open-ended data
academic_only <- filter(dat, isExpert == 1 )

# View(dat[,c("ResponseId", "team_name", "isExpert", "MASE1_w2", "MASE1_w2")])
#note that missing NAs for isExpert at the end of the file is by design,  these are naïve benchmark estimates

#datasets that are filtered by phase (1 = May submission, 2 = November submission)
phase1 <- filter(dat, phase == 1)
phase2 <- filter(dat, phase == 2)

# Phase 1 & 2 further filtered to only include academics won't be necessary once we have updated objective data
phase1_exp <- filter(phase1, isExpert == 1)
phase2_exp <- filter(phase2, isExpert == 1)

objective <- dat %>% 
  filter(Method == "Objective", phase == 1) %>% 
  dplyr::select(domain:Month.12)
# Rank order the performance of all teams for each domain using MASE scores - academics only
t1.academ.sorted <- phase1_exp %>%
      arrange(domain, MASE1_w1) %>%
      group_by(domain) %>% 
      mutate(Rank = row_number()) %>% 
      add_count(name="Nteams") %>% 
      dplyr::select(team_name, domain, Rank, Nteams, Method.code, Month.1:Month.12,mean_abs_error_w1,MASE1_w1)

# Create intuitive labels for domains
t1.academ.sorted$Domains[t1.academ.sorted$domain=="eafric"]<-"Explicit African American Bias"
## Warning: Unknown or uninitialised column: `Domains`.
t1.academ.sorted$Domains[t1.academ.sorted$domain=="easian"]<-"Explicit Asian American Bias"
t1.academ.sorted$Domains[t1.academ.sorted$domain=="egend"]<-"Explicit Gender-Career Bias"
t1.academ.sorted$Domains[t1.academ.sorted$domain=="iafric"]<-"Implicit African American Bias"
t1.academ.sorted$Domains[t1.academ.sorted$domain=="iasian"]<-"Implicit Asian American Bias"
t1.academ.sorted$Domains[t1.academ.sorted$domain=="igend"]<-"Implicit Gender-Career Bias"
t1.academ.sorted$Domains[t1.academ.sorted$domain=="ideoldem"]<-"Ideological Preferences for Democrats"
t1.academ.sorted$Domains[t1.academ.sorted$domain=="ideolrep"]<-"Ideological Preferences for Republicans"
t1.academ.sorted$Domains[t1.academ.sorted$domain=="lifesat"]<-"Life Satisfaction"
t1.academ.sorted$Domains[t1.academ.sorted$domain=="negaffect"]<-"Negative Affect in Social Media"
t1.academ.sorted$Domains[t1.academ.sorted$domain=="posaffect"]<-"Positive Affect in Social Media"
t1.academ.sorted$Domains[t1.academ.sorted$domain=="polar"]<-"Political Polarization"

# Compute average accuracy for each domain - non-academic only
t1.nonacadem.av.sorted <- phase1 %>% 
  filter(isExpert.factor == 'Prolific') %>%                                                               dplyr::select(team_name,domain,Month.1:Month.12,mean_abs_error_w1,MASE1_w1,Method.code) %>% 
  group_by(domain) %>% 
  summarise(across(where(is.numeric), mean)) %>% 
  arrange(domain,MASE1_w1) %>% 
  mutate(team_name="average non-academic")

# Compute median accuracy for each domain - non-academic only
t1.nonacadem.median.sorted<- phase1 %>% 
  filter(isExpert.factor == 'Prolific') %>%                                                               dplyr::select(team_name,domain,Month.1:Month.12,mean_abs_error_w1,MASE1_w1,,Method.code) %>% 
  group_by(domain) %>% 
  summarise(across(where(is.numeric), median)) %>% 
  arrange(domain,MASE1_w1) %>% 
  mutate(team_name="median non-academic")

# Compute best prediction for each of the domains - in other words prediction with the lowest MASE scores - non academics
t1.nonacadem.best.sorted <- phase1 %>% 
  filter(isExpert.factor == 'Prolific') %>%                                                               dplyr::select(team_name,domain,Month.1:Month.12,mean_abs_error_w1,MASE1_w1,Method.code) %>% 
  group_by(domain) %>% 
  summarise(across(where(is.numeric), min)) %>% 
  arrange(domain, MASE1_w1) %>% 
  mutate(team_name = "top non-academic")

# Best predictions for academics by domain
t1.academ.best.sorted <- phase1 %>% 
  filter(isExpert.factor == 'Academic') %>%                                                               dplyr::select(team_name,domain,Month.1:Month.12,mean_abs_error_w1,MASE1_w1,Method.code) %>% 
  group_by(domain) %>% 
  summarise(across(where(is.numeric), min)) %>% 
  arrange(domain,MASE1_w1) %>% 
  mutate(team_name="top academic")

# Combine the two datasets.
t1.top.scores <- rbind(t1.academ.best.sorted, t1.nonacadem.best.sorted) %>% 
  arrange(domain, MASE1_w1)
#so, only for life satisfaction and polarization, best academic was better than best non-academic. For all other domains, non-academics were in fact better (but note that the sample of non-academic was larger)

#what is the percentage of academics and lay people, respectively, who were below 1 on MASE?

t1.scores <- rbind(t1.academ.sorted, t1.nonacadem.median.sorted)
# write.csv(t1.scores,"wave1.scores.csv") 

# Rank order the performance of all teams for each domain using MASE scores - academics only
t2.academ.sorted <- academic_only %>% 
  filter(!(phase == 1 & revised == 1)) %>% 
  arrange(domain, MASE1_w2) %>%
  group_by(domain) %>% 
  mutate(Rank = row_number()) %>% 
  add_count(name="Nteams") %>% 
  dplyr::select(team_name,domain,Rank,Nteams,Method.code,phase,revised,Month.7:Month.12,mean_abs_error_w2,MASE1_w2)

# Create intuitive labels for each domain
t2.academ.sorted$Domains[t2.academ.sorted$domain=="eafric"]<-"Explicit African American Bias"
## Warning: Unknown or uninitialised column: `Domains`.
t2.academ.sorted$Domains[t2.academ.sorted$domain=="easian"]<-"Explicit Asian American Bias"
t2.academ.sorted$Domains[t2.academ.sorted$domain=="egend"]<-"Explicit Gender-Career Bias"
t2.academ.sorted$Domains[t2.academ.sorted$domain=="iafric"]<-"Implicit African American Bias"
t2.academ.sorted$Domains[t2.academ.sorted$domain=="iasian"]<-"Implicit Asian American Bias"
t2.academ.sorted$Domains[t2.academ.sorted$domain=="igend"]<-"Implicit Gender-Career Bias"
t2.academ.sorted$Domains[t2.academ.sorted$domain=="ideoldem"]<-"Ideological Preferences for Democrats"
t2.academ.sorted$Domains[t2.academ.sorted$domain=="ideolrep"]<-"Ideological Preferences for Republicans"
t2.academ.sorted$Domains[t2.academ.sorted$domain=="lifesat"]<-"Life Satisfaction"
t2.academ.sorted$Domains[t2.academ.sorted$domain=="negaffect"]<-"Negative Affect in Social Media"
t2.academ.sorted$Domains[t2.academ.sorted$domain=="posaffect"]<-"Positive Affect in Social Media"
t2.academ.sorted$Domains[t2.academ.sorted$domain=="polar"]<-"Political Polarization"

# write.csv(t2.academ.sorted,"wave2.scores.csv") 

# Create intuitive labels for each domain
objective$Domains[objective$domain=="eafric"]<-"Explicit African American Bias"
objective$Domains[objective$domain=="easian"]<-"Explicit Asian American Bias"
objective$Domains[objective$domain=="egend"]<-"Explicit Gender-Career Bias"
objective$Domains[objective$domain=="iafric"]<-"Implicit African American Bias"
objective$Domains[objective$domain=="iasian"]<-"Implicit Asian American Bias"
objective$Domains[objective$domain=="igend"]<-"Implicit Gender-Career Bias"
objective$Domains[objective$domain=="ideoldem"]<-"Ideological Preferences for Democrats"
objective$Domains[objective$domain=="ideolrep"]<-"Ideological Preferences for Republicans"
objective$Domains[objective$domain=="lifesat"]<-"Life Satisfaction"
objective$Domains[objective$domain=="negaffect"]<-"Negative Affect in Social Media"
objective$Domains[objective$domain=="posaffect"]<-"Positive Affect in Social Media"
objective$Domains[objective$domain=="polar"]<-"Political Polarization"
# Create intuitive labels for each month
t1.academ.sorted <- t1.academ.sorted %>% rename(MASE=MASE1_w1,MAE=mean_abs_error_w1,
                                              May2020=Month.1,
                                              June2020=Month.2,
                                              July2020=Month.3,
                                              August2020=Month.4,
                                              Sept2020=Month.5,
                                              Oct2020=Month.6,
                                              Nov2020=Month.7,
                                              Dec2020=Month.8,
                                              Jan2021=Month.9,
                                              Feb2021=Month.10,
                                              March2021=Month.11,
                                              April2021=Month.12)

# Create tournament length variable
t1.academ.sorted$Tournament<-"May - 12-months"

# Examine St error across 12 months
t1.academ.sorted$stdev <- apply(t1.academ.sorted[c("May2020","June2020","July2020",  "August2020",
                                                      "Sept2020",    "Oct2020",     "Nov2020",     "Dec2020",
                                                      "Jan2021",     "Feb2021",     "March2021",   "April2021")], 1, sd)   

describeBy(t1.academ.sorted$stdev,group=t1.academ.sorted$domain)
## 
##  Descriptive statistics by group 
## group: eafric
##    vars  n mean   sd median trimmed  mad min  max range skew kurtosis se
## X1    1 22 0.01 0.01   0.01    0.01 0.01   0 0.04  0.04  1.1     0.11  0
## ------------------------------------------------------------ 
## group: easian
##    vars  n mean   sd median trimmed  mad min  max range skew kurtosis se
## X1    1 26 0.03 0.02   0.02    0.02 0.03   0 0.08  0.08 0.48    -1.14  0
## ------------------------------------------------------------ 
## group: egend
##    vars  n mean   sd median trimmed  mad min  max range skew kurtosis   se
## X1    1 21 0.03 0.03   0.02    0.02 0.02   0 0.14  0.14 2.14     4.95 0.01
## ------------------------------------------------------------ 
## group: iafric
##    vars  n mean   sd median trimmed  mad min  max range skew kurtosis se
## X1    1 25 0.01 0.01      0       0 0.01   0 0.04  0.04 2.27     6.35  0
## ------------------------------------------------------------ 
## group: iasian
##    vars  n mean   sd median trimmed  mad min  max range skew kurtosis se
## X1    1 30 0.02 0.03   0.01    0.01 0.01   0 0.13  0.13 2.58     7.08  0
## ------------------------------------------------------------ 
## group: ideoldem
##    vars  n mean   sd median trimmed mad min max range skew kurtosis   se
## X1    1 34 0.98 0.97   0.82    0.85 1.1   0 3.8   3.8 1.05     0.72 0.17
## ------------------------------------------------------------ 
## group: ideolrep
##    vars  n mean   sd median trimmed  mad min  max range skew kurtosis   se
## X1    1 34 1.42 1.35   1.08    1.23 0.79   0 5.55  5.55 1.42     1.76 0.23
## ------------------------------------------------------------ 
## group: igend
##    vars  n mean   sd median trimmed mad min  max range skew kurtosis se
## X1    1 22 0.01 0.01      0       0   0   0 0.06  0.06 3.07     9.78  0
## ------------------------------------------------------------ 
## group: lifesat
##    vars  n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 57 0.13 0.19   0.05    0.08 0.07   0   1     1 2.93     9.24 0.03
## ------------------------------------------------------------ 
## group: negaffect
##    vars  n mean   sd median trimmed  mad min  max range skew kurtosis   se
## X1    1 28  0.1 0.09    0.1     0.1 0.12   0 0.26  0.26 0.26    -1.19 0.02
## ------------------------------------------------------------ 
## group: polar
##    vars  n mean sd median trimmed  mad min  max range skew kurtosis   se
## X1    1 32 2.55  2    2.3    2.33 1.66   0 7.67  7.67 0.86     0.06 0.35
## ------------------------------------------------------------ 
## group: posaffect
##    vars  n mean  sd median trimmed  mad min  max range skew kurtosis   se
## X1    1 28 0.11 0.1   0.12    0.11 0.12   0 0.37  0.37 0.55    -0.42 0.02
t1.academ.sorted %>% group_by(domain) %>%
  dplyr::summarize(medianSD = median(stdev), min = min(stdev), max = max(stdev)) %>% arrange(desc(medianSD))
## # A tibble: 12 × 4
##    domain    medianSD   min    max
##    <chr>        <dbl> <dbl>  <dbl>
##  1 polar      2.30        0 7.67  
##  2 ideolrep   1.08        0 5.55  
##  3 ideoldem   0.817       0 3.80  
##  4 posaffect  0.120       0 0.365 
##  5 negaffect  0.104       0 0.260 
##  6 lifesat    0.0521      0 1     
##  7 easian     0.0233      0 0.0781
##  8 egend      0.0174      0 0.145 
##  9 eafric     0.0104      0 0.0429
## 10 iasian     0.00926     0 0.127 
## 11 iafric     0.00413     0 0.0380
## 12 igend      0.00209     0 0.0614
# Create intuitive labels for each month
t2.academ.sorted <- t2.academ.sorted %>% rename(MASE=MASE1_w2,MAE=mean_abs_error_w2,
                                              Nov2020=Month.7,
                                              Dec2020=Month.8,
                                              Jan2021=Month.9,
                                              Feb2021=Month.10,
                                              March2021=Month.11,
                                              April2021=Month.12)

# Create tournament length variable
t2.academ.sorted$Tournament<-"November - 6-months"

# Create intuitive labels
objective <- objective %>% rename(May2020=Month.1,
                                              June2020=Month.2,
                                              July2020=Month.3,
                                              August2020=Month.4,
                                              Sept2020=Month.5,
                                              Oct2020=Month.6,
                                              Nov2020=Month.7,
                                              Dec2020=Month.8,
                                              Jan2021=Month.9,
                                              Feb2021=Month.10,
                                              March2021=Month.11,
                                              April2021=Month.12)

objective$Tournament<-"Ground truth marker"

#examine SD
# Examine St error across 12 months
objective$stdev <- apply(objective[c("May2020","June2020","July2020",  "August2020",
                                                      "Sept2020",    "Oct2020",     "Nov2020",     "Dec2020",
                                                      "Jan2021",     "Feb2021",     "March2021",   "April2021")], 1, sd, na.rm=T)   
# Combine all and drop unnecessary variables
results<-rbind(t1.academ.sorted, t2.academ.sorted,objective) %>% 
  ungroup() %>% 
  dplyr::select(-domain,-Method.code, -(phase:revised)) 

# Arrange by Tournament and then move Domains and Tournament columns to the beginning of the dataset and all numeric variables after that.
results <- results %>% 
  arrange(Tournament) %>% 
  relocate(where(is.numeric), .after = where(is.character))


# write.csv(results,"final.results.csv")
#ANALYSES IN THIS SECTION ARE IN PART REPORTED IN THE SUPPLEMENT WENN DESCRIBING TOP PERFORMERS
#OTHER ANALYSES ARE JUST ADDED FOR AN INTERESTED READER, BUT DID NOT MAKE IT IN THE THE PAPER

pd <- position_dodge(0.7) # move them .07 to the left and right
labels<-c(
  eafric = "Exp. African\n-Am. Bias",
  easian = "Exp. Asian\n-Am. Bias",
  egend = "Exp. \nGender Bias",
  iafric = "Imp. African\n-Am. Bias",
  iasian = "Imp. Asian\n-Am. Bias",
  ideoldem = "Dem.\nSupport",
  ideolrep ="Rep.\nSupport",
  igend = "Imp.\nGender Bias",
  lifesat = "Life\nSatisfaction",
  negaffect = "Negative\nAffect",
  polar = "Polit.\nPolarization",
  posaffect = "Positive\nAffect")

#T1


#who won? - Which of the teams of academics made the best predictions for each of the domains? In other words we identify a winner for each of the domains based on the MASE score (so 12 academic winners in total) 
top.1.MASE.t1 <- phase1 %>% 
  filter(isExpert.factor == 'Academic')  %>%
  arrange(domain,MASE1_w1) %>% 
    group_by(team_name) %>%  mutate(n_domains = n()) %>% 
  group_by(domain) %>% 
  dplyr::slice_head(n = 1) %>% dplyr::select(team_name,mean_abs_error_w1,n_domains,MASE1_w1,Month.1:Month.12,domain,team_size.coded,discipline,previous_tournament.coded,Method.code,model,theory,numpred,parameters,Method.complex,team_expertise) %>%           arrange(MASE1_w1)

# write.csv(top.1.MASE.t1,"top.t1.csv")

# median MASE by domain?
median.MASE.t1 <- phase1 %>% 
  filter(isExpert.factor == 'Academic')  %>%
  arrange(domain) %>%
  group_by(domain) %>% 
  dplyr::summarize(MASE_med = median(MASE1_w1)) %>% 
  dplyr::select(domain,MASE_med) %>% 
  arrange(MASE_med)

# write.csv(median.MASE.t1,"medianMASE.t1.csv")

#examine top 5 - Top 5 winning teams for each domain based on MASE score.
top.5.MASE.t1 <- phase1 %>% 
  filter(isExpert.factor == 'Academic')  %>%
  arrange(domain,MASE1_w1) %>%
  group_by(team_name) %>%  mutate(n_domains = n()) %>% 
  group_by(domain) %>% 
  dplyr::slice_head(n = 5) %>% dplyr::select(team_name,MASE1_w1,domain,compare_to_naive_linear_MASE,compare_to_naive_rwf_MASE,team_size.coded,discipline,previous_tournament.coded,Method.code,model,theory,numpred,parameters,Method.complex,team_expertise,n_domains)

# Visualize top 5 winners (using MASE scores) by domain and approach.
#### Fig. S4. in manuscript
top.5.MASE.t1  %>% 
  ggplot(aes(x=domain, y=MASE1_w1, colour=Method.code)) +  
  geom_point(size=3, position=pd, alpha = .5) + 
  scale_x_discrete(labels=labels, name="") + 
  geom_hline(yintercept =1, linetype='dashed',color='red',14) +                                           theme(legend.position="top") + 
  scale_colour_aaas(name="Approach") + 
  ylab("MASE")
## Warning: `geom_hline()`: Ignoring `mapping` because `yintercept` was provided.

# Proportion of top 5 predictions by method used across all domains
proportions(xtabs( ~ Method.code,top.5.MASE.t1))*100 #in total
## Method.code
##      Data-Driven           Hybrid Intuition/Theory 
##        61.666667         8.333333        30.000000
# Proportion of top 5 predictions by method used by domain
proportions(xtabs( ~ domain+Method.code,top.5.MASE.t1),"domain")*100 #by domain
##            Method.code
## domain      Data-Driven Hybrid Intuition/Theory
##   eafric             60     20               20
##   easian             80     20                0
##   egend              80      0               20
##   iafric             40     20               40
##   iasian            100      0                0
##   ideoldem           80      0               20
##   ideolrep           80      0               20
##   igend              80     20                0
##   lifesat            80      0               20
##   negaffect          20     20               60
##   polar              20      0               80
##   posaffect          20      0               80
# Visualize top 5 predictions for each domain as they compare to baseline models such as naive linear model and random walk. 
#### Fig. S5. in manuscript

top.5.MASE.t1  %>%
  ggplot(aes(x=domain, y=MASE1_w1, colour=compare_to_naive_linear_MASE, shape =compare_to_naive_rwf_MASE)) +  
  geom_point(size=3, position=pd, alpha = .5) + 
  scale_x_discrete(labels=labels, name="") + 
  geom_hline(yintercept =1, linetype='dashed', color='red', 14) +                                         theme(legend.position="top") + 
  scale_colour_d3(name="Compared to\nLinear Model") +                                                     scale_shape_discrete(name="Compared to\nRandom Walk") + 
  ylab("MASE")
## Warning: `geom_hline()`: Ignoring `mapping` because `yintercept` was provided.

# Visualize top 5 predictions for each domain by discipline
#### Fig. S6. in manuscript

top.5.MASE.t1 %>%  
  ggplot(aes(x=domain, y=MASE1_w1, colour=discipline)) +  
  geom_point(size=3, position=pd, alpha = .5) + 
  scale_x_discrete(labels=labels, name="") + 
  geom_hline(yintercept =1, linetype='dashed', color='red', 14) +                                         theme(legend.position="top") + 
  scale_colour_d3(name="Field") + 
  ylab("MASE")
## Warning: `geom_hline()`: Ignoring `mapping` because `yintercept` was provided.

# Proportion of top 5 predictions for each domain across all domains (so 60 predictions)
proportions(xtabs( ~ discipline, top.5.MASE.t1))*100 #in total
## discipline
##   Behavioral Sciences Data/Computer Science    Multi-disciplinary 
##              16.66667              30.00000              15.00000 
##       Social Sciences 
##              38.33333
# Proportion of top 5 predictions for each domain by domains
proportions(xtabs( ~ domain+discipline,top.5.MASE.t1),"domain")*100 #by domain
##            discipline
## domain      Behavioral Sciences Data/Computer Science Multi-disciplinary
##   eafric                     20                    40                  0
##   easian                     20                    40                 20
##   egend                       0                    80                 20
##   iafric                      0                    40                  0
##   iasian                      0                    60                 40
##   ideoldem                    0                     0                 40
##   ideolrep                    0                     0                 40
##   igend                       0                    80                  0
##   lifesat                    40                    20                 20
##   negaffect                  60                     0                  0
##   polar                      20                     0                  0
##   posaffect                  40                     0                  0
##            discipline
## domain      Social Sciences
##   eafric                 40
##   easian                 20
##   egend                   0
##   iafric                 60
##   iasian                  0
##   ideoldem               60
##   ideolrep               60
##   igend                  20
##   lifesat                20
##   negaffect              40
##   polar                  80
##   posaffect              60
# Visualize top 5 predictions for each domain as a function of previous forecasting experience
#### Fig. S7. in manuscript

top.5.MASE.t1 %>%  
  ggplot(aes(x=domain, y=MASE1_w1, colour=as.factor(previous_tournament.coded))) +  
  geom_point(size=3, position=pd, alpha = .5) + 
  scale_x_discrete(labels=labels, name="") +                
  geom_hline(yintercept =1, linetype='dashed', color='red', 14) +                                         theme(legend.position="top") + 
  scale_colour_d3(name="Prior Forecasting Experience")+ylab("MASE")
## Warning: `geom_hline()`: Ignoring `mapping` because `yintercept` was provided.

# Proportion of top 5 estimates for each domain as a function of previous forecasting experience
proportions(xtabs( ~ previous_tournament.coded,top.5.MASE.t1))*100 #in total
## previous_tournament.coded
##        0        1 
## 78.33333 21.66667
# Proportion of top 5 estimates for each domain as a function of previous forecasting experience for academics only
proportions(xtabs( ~ previous_tournament.coded,phase1 %>% filter(isExpert.factor == 'Academic') ))*100 #baserate of prior experience to compare to top 5
## previous_tournament.coded
##        0        1 
## 82.45125 17.54875
# Proportion of top 5 estimates for each domain as a function of previous forecasting experience for by domain
proportions(xtabs( ~ domain+previous_tournament.coded,top.5.MASE.t1),"domain")*100 #by domain
##            previous_tournament.coded
## domain        0   1
##   eafric     80  20
##   easian     80  20
##   egend      40  60
##   iafric     80  20
##   iasian     20  80
##   ideoldem  100   0
##   ideolrep  100   0
##   igend      80  20
##   lifesat    60  40
##   negaffect 100   0
##   polar     100   0
##   posaffect 100   0
# Visualize the size of top 10 teams by domain for academics only
phase1 %>% 
  filter(isExpert.factor == 'Academic')  %>%
   arrange(domain,MASE1_w1) %>% 
   group_by(domain) %>% 
   dplyr::slice_head(n = 10) %>% dplyr::select(team_name,MASE1_w1,domain,team_size.coded,discipline,previous_tournament.coded,Method.code,model,theory,numpred,parameters,Method.complex,team_expertise) %>%
  ggplot(aes(x = domain, y = team_size.coded)) +
  stat_summary(fun.data="mean_cl_boot",  position=pd) + 
  theme_minimal(base_size = 14) +
  theme(legend.position="bottom") + 
  scale_x_discrete(labels=labels, name="") +
  labs(colour = "Approach",fill="Approach", x="",y="Size of Top 10 Teams (M +/- 95%CI)")

# Visualize the complexity of the predictions for top 10 teams by domain for academics only
phase1 %>% 
  filter(isExpert.factor == 'Academic')  %>%
  arrange(domain,MASE1_w1) %>% 
  group_by(domain) %>% 
  dplyr::slice_head(n = 10) %>%                                                                           dplyr::select(team_name,MASE1_w1,domain,team_size.coded,discipline,previous_tournament.coded,Method.code,model,theory,numpred,parameters,Method.complex,team_expertise) %>%
  ggplot(aes(x = domain, y = Method.complex)) +
  stat_summary(fun.data="mean_cl_boot",  position=pd)+theme_minimal(base_size = 14) +
  theme(legend.position="bottom") +
  scale_x_discrete(labels=labels, name="") +
  labs(colour = "Approach",fill="Approach", x="",y="Model complexity (M +/- 95%CI)")
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_summary()`).

# Visualize percentage of females in each team for top 10 teams by domain for academics only
phase1 %>% 
  filter(isExpert.factor == 'Academic')  %>%
  arrange(domain,MASE1_w1) %>%
  group_by(domain) %>% 
  dplyr::slice_head(n = 10) %>%                                                                           dplyr::select(team_name,MASE1_w1,domain,team_size.coded,discipline,previous_tournament.coded,Method.code,model,theory,numpred,parameters,Method.complex,team_expertise,team_gender,team_education,team_Age,non_US ) %>%
  ggplot(aes(x = domain, y = team_gender)) +
  stat_summary(fun.data="mean_cl_boot",  position=pd)+theme_minimal(base_size = 14) +
  theme(legend.position="bottom") +
  scale_x_discrete(labels=labels, name="")+
  labs(colour = "Approach",fill="Approach", x="",y="% Female per Team (M +/- 95%CI)")

# Visualize education for top 10 teams by domain for academics only
phase1 %>% 
  filter(isExpert.factor == 'Academic')  %>%
  arrange(domain,MASE1_w1) %>% 
  group_by(domain) %>% 
  dplyr::slice_head(n = 10) %>% dplyr::select(team_name,MASE1_w1,domain,team_size.coded,discipline,previous_tournament.coded,Method.code,model,theory,numpred,parameters,Method.complex,team_expertise,team_gender,team_education,team_Age,non_US ) %>%
  ggplot(aes(x = domain, y = team_education)) +
  stat_summary(fun.data="mean_cl_boot",  position=pd)+theme_minimal(base_size = 14) +
  theme(legend.position="bottom") + 
  scale_x_discrete(labels=labels, name="") +
  labs(colour = "Approach",fill="Approach", x="",y=" (M +/- 95%CI)")

# Visualize team age for top 10 teams by domain for academics only
phase1 %>% 
  filter(isExpert.factor == 'Academic')  %>%
  arrange(domain,MASE1_w1) %>%group_by(domain) %>% 
  dplyr::slice_head(n = 10) %>%                                                                           dplyr::select(team_name,MASE1_w1,domain,team_size.coded,discipline,previous_tournament.coded,Method.code,model,theory,numpred,parameters,Method.complex,team_expertise,team_gender,team_education,team_Age,non_US ) %>%
  ggplot(aes(x = domain, y = team_Age)) +
  stat_summary(fun.data="mean_cl_boot",  position=pd)+theme_minimal(base_size = 14) +
  theme(legend.position="bottom") +scale_x_discrete(labels=labels, name="") +
  labs(colour = "Approach",fill="Approach", x="",y="% Average Team Age (M +/- 95%CI)")
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_summary()`).

# Visualize percentage of non-us team members for 10 teams by domain for academics only
phase1 %>% 
  filter(isExpert.factor == 'Academic')  %>%
  arrange(domain,MASE1_w1) %>% 
  group_by(domain) %>% 
  dplyr::slice_head(n = 10) %>%
  dplyr::select(team_name,MASE1_w1,domain,team_size.coded,discipline,previous_tournament.coded,Method.code,model,theory,numpred,parameters,Method.complex,team_expertise,team_gender,team_education,team_Age,non_US )%>%
  ggplot(aes(x = domain, y = non_US)) +
  stat_summary(fun.data="mean_cl_boot", position=pd)+theme_minimal(base_size = 14) +
  theme(legend.position="bottom") + 
  scale_x_discrete(labels=labels, name="") +
  labs(colour = "Approach",fill="Approach", x="",y="% Non-US per Team (M +/- 95%CI)")

## comparison to lay people

## Compare performance to naive RW between academics and lay people across domains.
proportions(xtabs( ~ compare_to_naive_rwf_MASE+isExpert.factor,phase1),"isExpert.factor")*100 #
##                          isExpert.factor
## compare_to_naive_rwf_MASE Academic Prolific
##         Above Random Walk 13.09192 30.87935
##         Below Random Walk 86.90808 69.12065
## Compare performance to naive RW between academics and lay people across domains using chi squared test
chisq.test(xtabs( ~ compare_to_naive_rwf_MASE+isExpert.factor,phase1))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  xtabs(~compare_to_naive_rwf_MASE + isExpert.factor, phase1)
## X-squared = 45.002, df = 1, p-value = 0.00000000001968
## Compare performance to naive RW by method across domains using chi squared test
chisq.test(xtabs( ~ compare_to_naive_rwf_MASE+Method.code,subset(phase1, compare_to_naive_rwf_MASE!="Equal to Naive rwf"))) #exclude equal as it is negligible and screws up calculation
## Warning in chisq.test(xtabs(~compare_to_naive_rwf_MASE + Method.code,
## subset(phase1, : Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  xtabs(~compare_to_naive_rwf_MASE + Method.code, subset(phase1,     compare_to_naive_rwf_MASE != "Equal to Naive rwf"))
## X-squared = 56.203, df = 5, p-value = 0.0000000000738
## Compare performance to naive linear between academics and lay people across domains
proportions(xtabs( ~ compare_to_naive_linear_MASE+isExpert.factor,phase1),"isExpert.factor")*100 #
##                             isExpert.factor
## compare_to_naive_linear_MASE Academic Prolific
##           Above Naive linear 68.52368 75.11929
##           Below Naive linear 31.47632 24.88071
## Compare performance to naive linear between academics and lay people across domains using chi square test.
chisq.test(xtabs( ~ compare_to_naive_linear_MASE+isExpert.factor,phase1))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  xtabs(~compare_to_naive_linear_MASE + isExpert.factor, phase1)
## X-squared = 6.1558, df = 1, p-value = 0.0131
## Comparison by method among academics 

## Compare performance against Naive RW by method
proportions(xtabs( ~ compare_to_naive_rwf_MASE+Method.code, phase1), "Method.code")*100 #
##                          Method.code
## compare_to_naive_rwf_MASE Data-Driven Ground Truth    Hybrid Intuition/Theory
##         Above Random Walk    8.196721              26.923077        16.666667
##         Below Random Walk   91.803279              73.076923        83.333333
##                          Method.code
## compare_to_naive_rwf_MASE Lay People Naive-linear Naive-rfw
##         Above Random Walk  30.879346     8.333333  8.333333
##         Below Random Walk  69.120654    91.666667 91.666667
chisq.test(xtabs( ~ compare_to_naive_rwf_MASE+Method.code,phase1))
## Warning in chisq.test(xtabs(~compare_to_naive_rwf_MASE + Method.code, phase1)):
## Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  xtabs(~compare_to_naive_rwf_MASE + Method.code, phase1)
## X-squared = NaN, df = 6, p-value = NA
#chisq.test(xtabs( ~ compare_to_naive_rwf_MASE+Method.code,subset(phase1, compare_to_naive_rwf_MASE!="Equal to Naive rwf")))

## Compare performance against Naive RW by method - just for academics
chisq.test(xtabs( ~ compare_to_naive_rwf_MASE+Method.code,phase1_exp))
## Warning in chisq.test(xtabs(~compare_to_naive_rwf_MASE + Method.code,
## phase1_exp)): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  xtabs(~compare_to_naive_rwf_MASE + Method.code, phase1_exp)
## X-squared = 9.9103, df = 2, p-value = 0.007047
#chisq.test(xtabs( ~ compare_to_naive_rwf_MASE+Method.code,subset(phase1_exp, compare_to_naive_rwf_MASE!="Equal to Naive rwf")))

# Compare performance against Naive linear by method
proportions(xtabs( ~ compare_to_naive_linear_MASE+Method.code,phase1),"Method.code")*100 #
##                             Method.code
## compare_to_naive_linear_MASE Data-Driven Ground Truth   Hybrid Intuition/Theory
##           Above Naive linear    62.29508              73.07692         75.33333
##           Below Naive linear    37.70492              26.92308         24.66667
##                             Method.code
## compare_to_naive_linear_MASE Lay People Naive-linear Naive-rfw
##           Above Naive linear   75.11929                       
##           Below Naive linear   24.88071
# Compare performance against Naive linear by method using chi square
chisq.test(xtabs( ~ compare_to_naive_linear_MASE+Method.code,phase1))
## Warning in chisq.test(xtabs(~compare_to_naive_linear_MASE + Method.code, :
## Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  xtabs(~compare_to_naive_linear_MASE + Method.code, phase1)
## X-squared = NaN, df = 6, p-value = NA
# Compare performance against Naive linear by method using chi square just academics
chisq.test(xtabs( ~ compare_to_naive_linear_MASE+Method.code,phase1_exp)) #just comparison of academics
## 
##  Pearson's Chi-squared test
## 
## data:  xtabs(~compare_to_naive_linear_MASE + Method.code, phase1_exp)
## X-squared = 6.7664, df = 2, p-value = 0.03394
##PHASE 2

#who won? - identify top performers for each domain using MASE scores in phase 2.
top.1.MASE.t2 <- academic_only  %>% 
  filter(!(phase == 1 & revised == 1)) %>% 
  arrange(domain, MASE1_w2) %>% 
  group_by(team_name) %>%  mutate(n_domains = n()) %>% 
  group_by(domain) %>% 
  dplyr::slice_head(n = 1) %>%                                                  dplyr::select(domain,mean_abs_error_w2,n_domains,MASE1_w2,team_name,mean_abs_percent_error_w2,compare_to_naive_linear_MASE_w2,compare_to_naive_rwf_MASE,team_size.coded,discipline,previous_tournament.coded,Method.code,model,theory,numpred,parameters,Method.complex,team_expertise,phase,revised)
  
# write.csv(top.1.MASE.t2,"top.t2.csv")

#median MASE by domain? - compute median accuracy by domain for phase 2.
median.MASE.t2 <- academic_only  %>% 
  filter(!(phase == 1 & revised == 1)) %>%
  arrange(domain) %>% 
  group_by(domain) %>% 
  dplyr::summarize(MASE_med = median(MASE1_w2)) %>% 
  dplyr::select(domain,MASE_med) %>% 
  arrange(MASE_med)

#write.csv(median.MASE.t2,"medianMASE.t2.csv")

#examine top 5 - top 5 performers by domain using MASE scores for academics in phase 2.
top.5.MASE.t2 <- academic_only %>% 
  filter(!(phase == 1 & revised == 1)) %>% 
  arrange(domain,MASE1_w2) %>% 
    group_by(team_name) %>%  mutate(n_domains = n()) %>% 
   group_by(domain) %>% 
  dplyr::slice_head(n = 5) %>% dplyr::select(team_name,MASE1_w2,domain,n_domains,compare_to_naive_linear_MASE_w2,compare_to_naive_rwf_MASE,team_size.coded,discipline,previous_tournament.coded,Method.code,model,theory,numpred,parameters,Method.complex,team_expertise,phase,revised)
  
# Visualize top 5 performers as a function of method used.
#### Fig. S8. in manuscript

top.5.MASE.t2 %>%  
  ggplot(aes(x=domain, y=MASE1_w2, colour=Method.code)) +  
  geom_point(size=3, position=pd, alpha = .5) + 
  scale_x_discrete(labels=labels, name="") +
  geom_hline(yintercept =1, linetype='dashed', color='red', 14) +                                         theme(legend.position="top") +
  scale_colour_aaas(name="Approach") +
  ylab("MASE")
## Warning: `geom_hline()`: Ignoring `mapping` because `yintercept` was provided.

# proportion of different methods used across the top 5 performance across all domains. 
proportions(xtabs( ~ Method.code,top.5.MASE.t2))*100 #in total
## Method.code
##      Data-Driven           Hybrid Intuition/Theory 
##        65.000000         6.666667        28.333333
# proportion of different methods used across the top 5 performance by domain
proportions(xtabs( ~ domain+Method.code,top.5.MASE.t2),"domain")*100 #by domain
##            Method.code
## domain      Data-Driven Hybrid Intuition/Theory
##   eafric             60     20               20
##   easian             80      0               20
##   egend             100      0                0
##   iafric            100      0                0
##   iasian             60     20               20
##   ideoldem          100      0                0
##   ideolrep           40     20               40
##   igend              60      0               40
##   lifesat            80      0               20
##   negaffect          20     20               60
##   polar              60      0               40
##   posaffect          20      0               80
# Top 5 Performance compared against naive linear and random walk by domain.
#### Fig. S9. in manuscript

top.5.MASE.t2 %>% 
  ggplot(aes(x=domain, y=MASE1_w2, colour=compare_to_naive_linear_MASE_w2, shape =compare_to_naive_rwf_MASE )) +  
  geom_point(size=3, position=pd, alpha = .5) + 
  scale_x_discrete(labels=labels, name="") + 
  geom_hline(yintercept =1, linetype='dashed', color='red', 14) +                                         theme(legend.position="top")+
  scale_colour_d3(name="Compared to\nLinear Model") + 
  scale_shape_discrete(name="Compared to\nRandom Walk") +
  ylab("MASE")
## Warning: `geom_hline()`: Ignoring `mapping` because `yintercept` was provided.

# Top 5 Performance by discipline and domain.
#### Fig. S10. in manuscript

top.5.MASE.t2 %>%  
  ggplot(aes(x=domain, y=MASE1_w2, colour=discipline)) +  
  geom_point(size=3, position=pd, alpha = .5) + 
  scale_x_discrete(labels=labels, name="") + 
  geom_hline(yintercept =1, linetype='dashed', color='red', 14) +                                         theme(legend.position="top") + 
  scale_colour_d3(name="Field") + 
  ylab("MASE")
## Warning: `geom_hline()`: Ignoring `mapping` because `yintercept` was provided.

# Proportion of top 5 performers by discipline across all domains
proportions(xtabs( ~ discipline,top.5.MASE.t2))*100 #in total
## discipline
##   Behavioral Sciences Data/Computer Science    Multi-disciplinary 
##              18.33333              15.00000              15.00000 
##       Social Sciences 
##              51.66667
# Proportion of top 5 performers by discipline and domain
proportions(xtabs( ~ domain+discipline,top.5.MASE.t2),"domain")*100 #by domain
##            discipline
## domain      Behavioral Sciences Data/Computer Science Multi-disciplinary
##   eafric                     20                    20                 20
##   easian                     20                     0                  0
##   egend                       0                    40                 20
##   iafric                     20                    20                  0
##   iasian                     20                     0                 20
##   ideoldem                    0                    40                  0
##   ideolrep                    0                    20                 20
##   igend                      20                    20                  0
##   lifesat                    20                     0                 20
##   negaffect                  60                    20                 20
##   polar                       0                     0                 60
##   posaffect                  40                     0                  0
##            discipline
## domain      Social Sciences
##   eafric                 40
##   easian                 80
##   egend                  40
##   iafric                 60
##   iasian                 60
##   ideoldem               60
##   ideolrep               60
##   igend                  60
##   lifesat                60
##   negaffect               0
##   polar                  40
##   posaffect              60
# Performance of top 5 by prior forecasting experience and domain.
#### Fig. S11. in manuscript

top.5.MASE.t2  %>% 
  ggplot(aes(x=domain, y=MASE1_w2, colour=as.factor(previous_tournament.coded))) +  
  geom_point(size=3, position=pd, alpha = .5) + 
  scale_x_discrete(labels=labels, name="") + 
  geom_hline(yintercept =1, linetype='dashed', color='red', 14) +                                         theme(legend.position="top") + 
  scale_colour_d3(name="Prior Forecasting Experience") + 
  ylab("MASE")
## Warning: `geom_hline()`: Ignoring `mapping` because `yintercept` was provided.

# Proportion of top 5 who had previous tournament experience across all domains
proportions(xtabs( ~ previous_tournament.coded,top.5.MASE.t2))*100 #in total
## previous_tournament.coded
##        0        1 
## 78.33333 21.66667
# Proportion of top 5 who had previous tournament experience - academics only
proportions(xtabs( ~ previous_tournament.coded,academic_only%>% filter(!(phase == 1 & revised == 1))))*100 #baserate of prior experience to compare to top 5
## previous_tournament.coded
##        0        1 
## 84.98168 15.01832
# Proportion of top 5 who had previous tournament experience by domain
proportions(xtabs( ~ domain+previous_tournament.coded,top.5.MASE.t2),"domain")*100 #by domain
##            previous_tournament.coded
## domain        0   1
##   eafric     60  40
##   easian     80  20
##   egend      80  20
##   iafric     60  40
##   iasian     60  40
##   ideoldem   80  20
##   ideolrep   80  20
##   igend     100   0
##   lifesat    80  20
##   negaffect 100   0
##   polar      60  40
##   posaffect 100   0
# Size of top ten teams by domain
academic_only %>% 
  filter(!(phase == 1 & revised == 1)) %>% 
  arrange(domain,MASE1_w2) %>%
  group_by(domain) %>% 
  dplyr::slice_head(n = 10) %>% dplyr::select(team_name,MASE1_w2,domain,team_size.coded,discipline,previous_tournament.coded,Method.code,model,theory,numpred,parameters,Method.complex,team_expertise) %>%
  ggplot(aes(x = domain, y = team_size.coded)) +
  stat_summary(fun.data="mean_cl_boot",  position=pd)+theme_minimal(base_size = 14) +
  theme(legend.position="bottom") + 
  scale_x_discrete(labels=labels, name="") +
  labs(colour = "Approach",fill="Approach", x="",y="Size of Top 10 Teams (M +/- 95%CI)")

# Top 10 teams model complexity by domain.
academic_only %>% 
  filter(!(phase == 1 & revised == 1)) %>% 
  arrange(domain,MASE1_w2) %>% 
  group_by(domain) %>% 
  dplyr::slice_head(n = 10) %>% dplyr::select(team_name,MASE1_w1,domain,team_size.coded,discipline,previous_tournament.coded,Method.code,model,theory,numpred,parameters,Method.complex,team_expertise) %>%
  ggplot(aes(x = domain, y = Method.complex)) +
  stat_summary(fun.data="mean_cl_boot", position=pd)+theme_minimal(base_size = 14) +
  theme(legend.position="bottom") + 
  scale_x_discrete(labels=labels, name="") +
  labs(colour = "Approach",fill="Approach", x="",y="Model complexity (M +/- 95%CI)")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_summary()`).

# Top 5 teams model complexity by domain.
academic_only %>% 
  filter(!(phase == 1 & revised == 1)) %>% 
  arrange(domain,MASE1_w2) %>% 
  group_by(domain) %>% 
  dplyr::slice_head(n = 5) %>% dplyr::select(team_name,MASE1_w1,domain,team_size.coded,discipline,previous_tournament.coded,Method.code,model,theory,numpred,parameters,Method.complex,team_expertise) %>%
  ggplot(aes(x = domain, y = Method.complex)) +
  stat_summary(fun.data="mean_cl_boot",  position=pd) + 
  theme_minimal(base_size = 14) +
  theme(legend.position="bottom") + 
  scale_x_discrete(labels=labels, name="") +
  labs(colour = "Approach",fill="Approach", x="",y="Model complexity (M +/- 95%CI)") #same as for top 10

# %of females per top 10 winning teams by domain.
academic_only %>% 
  filter(!(phase == 1 & revised == 1)) %>% 
  arrange(domain,MASE1_w2) %>% 
  group_by(domain) %>% 
  dplyr::slice_head(n = 10) %>% dplyr::select(team_name,MASE1_w2,domain,team_size.coded,discipline,previous_tournament.coded,Method.code,model,theory,numpred,parameters,Method.complex,team_expertise,team_gender,team_education,team_Age,non_US )%>%
  ggplot(aes(x = domain, y = team_gender)) +
  stat_summary(fun.data="mean_cl_boot",  position=pd)+theme_minimal(base_size = 14) +
  theme(legend.position="bottom") + 
  scale_x_discrete(labels=labels, name="")+
  labs(colour = "Approach",fill="Approach", x="",y="% Female per Team (M +/- 95%CI)")

# % of non-Phds per team.
academic_only %>% 
  filter(!(phase == 1 & revised == 1)) %>% 
  arrange(domain,MASE1_w2) %>% 
  group_by(domain) %>% 
  dplyr::slice_head(n = 10) %>% dplyr::select(team_name,MASE1_w2,domain,team_size.coded,discipline,previous_tournament.coded,Method.code,model,theory,numpred,parameters,Method.complex,team_expertise,team_gender,team_education,team_Age,non_US )%>%
  ggplot(aes(x = domain, y = team_education)) +
  stat_summary(fun.data="mean_cl_boot", position=pd)+theme_minimal(base_size = 14) +
  theme(legend.position="bottom") +scale_x_discrete(labels=labels, name="") +
  labs(colour = "Approach",fill="Approach", x="",y="% Non_PHD per Team (M +/- 95%CI)")

# Average team age for top 10 teams by domain.
academic_only %>% 
  filter(!(phase == 1 & revised == 1)) %>% 
  arrange(domain,MASE1_w2) %>% 
  group_by(domain) %>% 
  dplyr::slice_head(n = 10) %>%   dplyr::select(team_name,MASE1_w2,domain,team_size.coded,discipline,previous_tournament.coded,Method.code,model,theory,numpred,parameters,Method.complex,team_expertise,team_gender,team_education,team_Age,non_US )%>%
  ggplot(aes(x = domain, y = team_Age)) +
  stat_summary(fun.data="mean_cl_boot",  position=pd)+theme_minimal(base_size = 14) +
  theme(legend.position="bottom") + 
  scale_x_discrete(labels=labels, name="") +
  labs(colour = "Approach",fill="Approach", x="",y="% Average Team Age (M +/- 95%CI)")
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_summary()`).

# % of non us individuals on top 10 teams by domain.
academic_only %>% 
  filter(!(phase == 1 & revised == 1)) %>% 
  arrange(domain,MASE1_w2) %>%
  group_by(domain) %>% 
  dplyr::slice_head(n = 10) %>% dplyr::select(team_name,MASE1_w2,domain,team_size.coded,discipline,previous_tournament.coded,Method.code,model,theory,numpred,parameters,Method.complex,team_expertise,team_gender,team_education,team_Age,non_US) %>%
  ggplot(aes(x = domain, y = non_US)) +
  stat_summary(fun.data="mean_cl_boot",  position=pd)+theme_minimal(base_size = 14) +
  theme(legend.position="bottom") + 
  scale_x_discrete(labels=labels, name="")+
  labs(colour = "Approach",fill="Approach", x="",y="% Non-US per Team (M +/- 95%CI)")

## Comparison by method among academics

# Percentage of academics who performed above, below, or just as well as naive rwf by method type across all domains.
proportions(xtabs( ~ compare_to_naive_rwf_MASE_w2+Method.code,academic_only %>% 
                     filter(!(phase == 1 & revised == 1))),"Method.code")*100 #
##                             Method.code
## compare_to_naive_rwf_MASE_w2 Data-Driven     Hybrid Intuition/Theory
##           Above Naive rwf     63.7931034 71.1111111       70.6161137
##           Below Naive rwf     35.8620690 28.8888889       29.3838863
##           Equal to Naive rwf   0.3448276  0.0000000        0.0000000
# Chi square test of the proportions above while also dropping equal to.
chisq.test(xtabs( ~ compare_to_naive_rwf_MASE_w2+Method.code,subset(academic_only%>% filter(!(phase == 1 & revised == 1)), compare_to_naive_rwf_MASE_w2!="Equal to Naive rwf"))) #exclude equal as it is negligible and screws up calculation
## 
##  Pearson's Chi-squared test
## 
## data:  xtabs(~compare_to_naive_rwf_MASE_w2 + Method.code, subset(academic_only %>%     filter(!(phase == 1 & revised == 1)), compare_to_naive_rwf_MASE_w2 !=     "Equal to Naive rwf"))
## X-squared = 2.7581, df = 2, p-value = 0.2518
# Percentage of academics who performed above, below, or just as well as naive linear by method type across all domains
proportions(xtabs( ~ compare_to_naive_linear_MASE_w2+Method.code,academic_only%>% filter(!(phase == 1 & revised == 1))),"Method.code")*100 #
##                                Method.code
## compare_to_naive_linear_MASE_w2 Data-Driven   Hybrid Intuition/Theory
##              Above Naive linear    61.03448 64.44444         64.92891
##              Below Naive linear    38.96552 35.55556         35.07109
# Chi square test of proportions above.
chisq.test(xtabs( ~ compare_to_naive_linear_MASE_w2+Method.code,academic_only%>% filter(!(phase == 1 & revised == 1))))
## 
##  Pearson's Chi-squared test
## 
## data:  xtabs(~compare_to_naive_linear_MASE_w2 + Method.code, academic_only %>%     filter(!(phase == 1 & revised == 1)))
## X-squared = 0.84847, df = 2, p-value = 0.6543
#examine if top 5 in T1 are also among top 5 in t2
top.5.MASE.t1$phase<-"T1"
top.5.MASE.t1$MASE<-top.5.MASE.t1$MASE1_w1
top.5.MASE.t2$phase<-"T2"
top.5.MASE.t2$MASE<-top.5.MASE.t2$MASE1_w2

top.5.MASE<-rbind(top.5.MASE.t1,top.5.MASE.t2) 
top.5.MASE%>% dplyr::select(team_name,MASE,domain,phase,model,revised) %>% group_by(domain) %>% count(team_name)
## # A tibble: 114 × 3
## # Groups:   domain [12]
##    domain team_name                       n
##    <chr>  <chr>                       <int>
##  1 eafric BlackSwan                       1
##  2 eafric Compassionate Values            2
##  3 eafric FMTeam                          1
##  4 eafric LongerSquigglyLines             1
##  5 eafric MLTS                            1
##  6 eafric Northstar                       1
##  7 eafric The Well-Adjusted R Squares     1
##  8 eafric Tosbik                          1
##  9 eafric fearfulastra                    1
## 10 easian AbCdEfG                         1
## # ℹ 104 more rows
#only in five out of 12 domains one top team from the first tournament appeared among the top five teams of a given domain in the second tournament: 
#Compassionate Values for Explicit African American bias; fearfulastra for Explicit Gender-Career bias; FMTeam for Implicit Asian American bias; AbCdEfG for Ideological Support of Democrats; A Woman Scientist for Negative Sentiment; NYHC for political polarization. The remaining top five teams were unique across tournaments. 

#examine consistency across domains in each tournament
top5.repeats.t1<-top.5.MASE.t1%>% dplyr::select(team_name,MASE,domain,n_domains) %>% group_by(team_name) %>% count(team_name) %>% arrange(desc(n))
psych::describe(top5.repeats.t1)#14 appear more than once; but M is small = 1.62
##            vars  n  mean    sd median trimmed   mad min max range skew kurtosis
## team_name*    1 37 19.00 10.82     19   19.00 13.34   1  37    36  0.0    -1.30
## n             2 37  1.62  0.92      1    1.48  0.00   1   4     3  1.2     0.18
##              se
## team_name* 1.78
## n          0.15
top5.repeats.t1.perc<-top5.repeats.t1 %>% left_join(top.5.MASE.t1 %>% dplyr::select(team_name,n_domains) )
## Adding missing grouping variables: `domain`
## Joining with `by = join_by(team_name)`
top5.repeats.t1.perc$perc<-top5.repeats.t1.perc$n/top5.repeats.t1.perc$n_domains*100
print(top5.repeats.t1.perc) # one team among those who were among the top five in more than 2 domains had a reasonably small number of domains they made predictions about (6), such that in 4 out of 6 = 67% they were in the top five. For the rest, the number of domains they were in the top five were below half of those they made predictions for.   
## # A tibble: 60 × 5
## # Groups:   team_name [37]
##    team_name                       n domain   n_domains  perc
##    <chr>                       <int> <chr>        <int> <dbl>
##  1 The Well-Adjusted R Squares     4 eafric           6  66.7
##  2 The Well-Adjusted R Squares     4 easian           6  66.7
##  3 The Well-Adjusted R Squares     4 iafric           6  66.7
##  4 The Well-Adjusted R Squares     4 igend            6  66.7
##  5 fearfulastra                    4 eafric          12  33.3
##  6 fearfulastra                    4 egend           12  33.3
##  7 fearfulastra                    4 iafric          12  33.3
##  8 fearfulastra                    4 igend           12  33.3
##  9 Bluebirds                       3 easian          11  27.3
## 10 Bluebirds                       3 ideoldem        11  27.3
## # ℹ 50 more rows
top5.repeats.t2<-top.5.MASE.t2%>% dplyr::select(team_name,MASE,domain,n_domains) %>% group_by(team_name) %>% count(team_name) %>% arrange(desc(n))
psych::describe(top5.repeats.t2)#17 appear more than once; but M is small = 1.67
##            vars  n  mean    sd median trimmed   mad min max range skew kurtosis
## team_name*    1 36 18.50 10.54   18.5    18.5 13.34   1  36    35 0.00    -1.30
## n             2 36  1.67  0.93    1.0     1.5  0.00   1   5     4 1.73     3.18
##              se
## team_name* 1.76
## n          0.15
top5.repeats.t2.perc<-top5.repeats.t2 %>% left_join(top.5.MASE.t2 %>% dplyr::select(team_name,n_domains) )
## Adding missing grouping variables: `domain`
## Joining with `by = join_by(team_name)`
top5.repeats.t2.perc$perc<-top5.repeats.t2.perc$n/top5.repeats.t2.perc$n_domains*100
print(top5.repeats.t2.perc) # one team among those who were among the top five in more than 2 domains had a reasonably small number of domains they made predictions about (9), such that in 5 out of 9 = 55.56% they were in the top five. For the rest, the number of domains they were in the top five were at/below half of those they made predictions for.   
## # A tibble: 60 × 5
## # Groups:   team_name [36]
##    team_name                  n domain   n_domains  perc
##    <chr>                  <int> <chr>        <int> <dbl>
##  1 AbCdEfG                    5 easian           9  55.6
##  2 AbCdEfG                    5 iafric           9  55.6
##  3 AbCdEfG                    5 iasian           9  55.6
##  4 AbCdEfG                    5 ideoldem         9  55.6
##  5 AbCdEfG                    5 igend            9  55.6
##  6 Polarization Disciples     4 easian          12  33.3
##  7 Polarization Disciples     4 egend           12  33.3
##  8 Polarization Disciples     4 iafric          12  33.3
##  9 Polarization Disciples     4 iasian          12  33.3
## 10 adamman                    3 easian          12  25  
## # ℹ 50 more rows

Visualize historical results

#THE SECTION BELOW IS CHIEFLY FOR INFORMATION ABOUT THE HISTORICAL TRENDS, AND VISUALLY INSPECTING VARIABILITY IN TRENDS, AS OUTLINED IN ONE SENTENCE IN THE DISCUSSION OF THE PAPER.

# Read in historical data
historical <- read.csv("historical_data.csv")
historical_tsbl <- as_tsibble(historical, index = Month)
labels<-c(
  eafric = "Exp. Bias Vs. Afr.-Am\nhigher=stereotype-consistent\n(-3 to +3)",
  easian = "Exp. Bias Vs. Asian.-Am\nhigher=stereotype-consistent\n(-3 to +3)",
  egend = "Exp. Bias Vs. Women-Career\nhigher=stereotype-consistent\n(-3 to +3)",
  iafric = "Imp. Bias Vs. Afr.-Am.\nhigher=stereotype-consistent\n(IAT D score)",
  iasian = "Imp. Bias Vs. Asian.-Am.\nhigher=stereotype-consistent\n(IAT D score)",
  ideoldem = "Democratic Support\n(% Population)",
  ideolrep ="Republican Support\n(% Population)",
  igend = "Imp. Bias Vs. Women-Career\nhigher=stereotype-consistent\n(IAT D score)",
  lifesat = "Life Satisfaction\nCantril ladder\n(0-10 scale)",
  negaffect = "Negative Affect\nstandardized Vs. historical M/SD\n(z-score)",
  polar = "Polit. Polarization\n% of Rep. Vs. Dem. approvals\n(absolute difference score) ",
  posaffect = "Positive Affect\n(z-score)")

#calculate SD in historical data for the last 12 months before the pandemic.
historical %>% filter(Month < 0 & Month >-13) %>% dplyr::summarise_at(vars(negaffect:polar), sd, na.rm = TRUE)
##   negaffect posaffect     eafric     easian     egend      iafric     iasian
## 1 0.1288827 0.1937484 0.02861003 0.06444547 0.0570047 0.003186042 0.02176622
##        igend    lifesat ideoldem ideolrep    polar
## 1 0.01412354 0.03888933   2.3063 1.950518 2.016504
# Actual change based on survey results for each domain.
actual_change_plot<-historical_tsbl %>% 
  pivot_longer(negaffect:polar,names_to="Domain",values_to="Score") %>% mutate(Domain = factor(Domain,      # Reordering group factor levels
                         levels = c("egend","easian","eafric",
                                    "igend","iasian","iafric",
                                    "posaffect","negaffect","lifesat",
                                    "polar","ideoldem","ideolrep"))) %>% 
  ggplot(aes(x = Month, y = Score, colour = Domain))+
 # geom_smooth(aes(x = Month, y = Score, colour = Domain),method = "loess") +
  geom_line()+
  theme_minimal(base_size = 14) +geom_vline(xintercept =1, linetype='dotted', color='black',14)+
  geom_point()+
  # Shade area above x = 1 (after start of the tournament)
  geom_rect(aes(ymin = -Inf, ymax = Inf, xmin = 1, xmax = Inf),
            alpha = 1/200,
            fill = "blue")+
  facet_wrap(~Domain, scales = "free_y", nrow = 4, labeller=labeller(Domain=labels))+
   theme(legend.position="none") +scale_x_continuous(breaks=c(-39:12), labels =c("Jan","","","","May","","","","Sep","","","","Jan","","","","May","","",
                                                                                 "","Sep","","","","Jan","","","","May","","","","Sep","","","","Jan",
                                                                                 "","","","May","","","","Sep","","","","Jan","","","Apr"))+  theme(axis.text.x = element_text(angle=45, vjust=.5, hjust=1, size=rel(0.8)))+
  labs(x="",y="Estimate") 
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.
actual_change_plot
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Historical long

#reorder levels of the domains
dat_long$domain <- factor(dat_long$domain,      # Reordering group factor levels
                         levels = c("egend","easian","eafric",
                                    "igend","iasian","iafric",
                                    "posaffect","negaffect","lifesat",
                                    "polar","ideoldem","ideolrep"))


## Convert data.frame into a tibble long format.
hist_long <- as_tibble(historical_tsbl) %>%
  pivot_longer(negaffect:polar,names_to="domain", values_to="Score")
#examine SD of all domains for historical data

#THIS SECTION BELOW COMPUTES MARKERS OF COMPLEXITY FOR THE TOURNAMENT, INCLUDING SD, MAD, AND AN SUPPLEMENTARY METRIC OF PERMUTATION ENTROPY

# Compute sd, mad, and entropy bt domain
hist_var_w1 <- as_tibble(historical_tsbl) %>% 
    subset(Month < 0) %>% 
    pivot_longer(negaffect:polar,names_to="domain",values_to="Score") %>%
    dplyr::select(domain,Score) %>% 
    group_by(domain) %>% 
    summarise(sd_hist_w1 = sd(Score), mad_hist_w1 = mad(Score), perp_entropy_hist_w1=permutation_entropy(Score))


tournament1_var <- 
  as_tibble(historical_tsbl) %>% 
  subset(Month > 0) %>% 
  pivot_longer(negaffect:polar,names_to="domain",values_to="Score") %>%
  dplyr::select(domain,Score) %>% 
  na.omit %>% 
  group_by(domain) %>% 
  summarise(sd_w1 = sd(Score), mad_w1 = mad(Score), perp_entropy_w1=permutation_entropy(Score))

hist_var_w2 <- as_tibble(historical_tsbl) %>% 
  subset(Month < 7) %>% 
  pivot_longer(negaffect:polar,names_to="domain",values_to="Score") %>%
  dplyr::select(domain,Score) %>% 
  group_by(domain) %>% 
  summarise(sd_hist_w2 = sd(Score), mad_hist_w2 = mad(Score), perp_entropy_hist_w2=permutation_entropy(Score))

tournament2_var <- as_tibble(historical_tsbl) %>% 
  subset(Month > 6) %>% 
  pivot_longer(negaffect:polar,names_to="domain",values_to="Score") %>%
  dplyr::select(domain,Score) %>% 
  na.omit%>%group_by(domain) %>% 
  summarise(sd_w2 = sd(Score), mad_w2 = mad(Score), perp_entropy_w2=permutation_entropy(Score))

complexity <- hist_var_w1 %>% 
  left_join(tournament1_var) %>%
  left_join(hist_var_w2) %>%
  left_join(tournament2_var)
## Joining with `by = join_by(domain)`
## Joining with `by = join_by(domain)`
## Joining with `by = join_by(domain)`

Visualizations

Visualizations of predictions across domains

#do by method (among experts now)
#reorder levels of the domains
dat_long$domain <- factor(dat_long$domain,      # Reordering group factor levels
                         levels = c("egend","easian","eafric",
                                    "igend","iasian","iafric",
                                    "posaffect","negaffect","lifesat",
                                    "polar","ideoldem","ideolrep"))

#get ground truth markers (subset)
dat_long$Month0<-dat_long$Month-1

objective<-as.data.frame(subset(dat_long,phase == 1 & !is.na(Method.code)& Method.code=="Ground Truth"))

#get subset for supplementary analyses, not in the paper(!), focusing on value.dif column i -  absolute percent deviation for each predicted Month

dat_long_phase1<-dat_long %>%subset(phase == 1 & Method.code!="Ground Truth"& Method.code!="Naive-linear"&Method.code!="Naive-rfw")
dat_long_phase1$Method.code <- relevel(factor(dat_long_phase1$Method.code), "Lay People") #use lay people as a reference group

#updates for the coding of categories
phase1$Method.code <- relevel(factor(phase1$Method.code), "Lay People") #use lay people as a reference group
phase1_exp$updated<-ifelse(phase1_exp$revised==1,"update","no update")
phase1$compare_to_naive_rwf_MASE.update<-ifelse(phase1$compare_to_naive_rwf_MASE!="Equal to Naive rwf",phase1$compare_to_naive_rwf_MASE,ifelse(phase1$compare_to_naive_rwf_MASE=="Equal to Naive rwf","Below Naive rwf",NA))
phase1_exp$teamS<-as.factor(ifelse(phase1_exp$team_size.coded>=6,3,ifelse(phase1_exp$team_size.coded<6&phase1_exp$team_size.coded>1,2,ifelse(phase1_exp$team_size.coded==1,1,NA))))
phase1_exp$is_multidisciplinary<-ifelse(phase1_exp$discipline=="Multi-disciplinary",1,0)
phase1_exp$objectivexpert<-ifelse(phase1_exp$pub==1,"Expert",ifelse(phase1_exp$pub==2,"Non Expert",NA))

#add historical variability data (as extra variable)
phase1_exp<-complexity %>% left_join(phase1_exp)
## Joining with `by = join_by(domain)`
#count how many domains per person
phase1_exp<-phase1_exp %>%group_by(team_name) %>% 
 mutate(n_domains = n())

#Supplementary analyses NOT in the paper: For models evaluating accuracy of individual time points, we will use forecasting type (purely theoretical, purely data-driven and hybrid models), forecasting domain and time points as predictors, with absolute percent deviation scores nested within teams. 

dat_long_phase1$teamS<-as.factor(ifelse(dat_long_phase1$team_size.coded>=6,3,ifelse(dat_long_phase1$team_size.coded<6&dat_long_phase1$team_size.coded>1,2,ifelse(dat_long_phase1$team_size.coded==1,1,NA))))
dat_long_phase1$is_multidisciplinary<-ifelse(dat_long_phase1$discipline=="Multi-disciplinary",1,0)
dat_long_phase1$objectivexpert<-ifelse(dat_long_phase1$pub==1,"Expert",ifelse(dat_long_phase1$pub==2,"Non Expert",NA))

###############################################################
#graph individual predictions (supplementary, NOT in the paper)
#BEGINNING
###############################################################


graph_indiv_pred<-dat_long %>% subset(phase == 1 & !is.na(Method.code)& Method.code!="Ground Truth"& Method.code!="Naive-linear"&Method.code!="Naive-rfw") %>% 
   ggplot(aes(x = Month0, y = value, colour = Method.code, fill=Method.code))+
  geom_smooth(aes(x = Month0, y = value, colour = Method.code, fill=Method.code),method = "loess") +  
  facet_wrap(~domain, scales = "free", nrow = 3, labeller=labeller(domain=labels))+theme_minimal(base_size = 14) +
  geom_smooth(data=objective,se=F) + #here we add the ground truth markers without confidence band
   theme(legend.position="bottom") +scale_color_d3()+scale_fill_d3()+ 
  labs(colour = "Sample",fill="Sample", x="Months (from May 2021)",y="Estimate (M +/- 95%CI)")

graph_indiv_pred
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

##without any benchmarks
dat_long$GT[dat_long$Method.code!="Ground Truth"]<-"Forecasting Estimate"
dat_long$GT[dat_long$Method.code=="Ground Truth"]<-"Ground Truth"
objective$GT<-objective$Method.code

graph_indiv_pred_nobechmarks<-dat_long %>% subset(phase == 1 & !is.na(Method.code)& Method.code!="Ground Truth"& Method.code!="Naive-linear"&Method.code!="Naive-rfw" &Method.code!="Lay People") %>% 
   ggplot(aes(x = Month0, y = value, colour = GT, fill=GT))+
  geom_smooth(aes(x = Month0, y = value, colour = GT, fill=GT),method = "loess") +  
  facet_wrap(~domain, scales = "free", nrow = 3, labeller=labeller(domain=labels))+theme_minimal() +
  geom_smooth(data=objective,se=F) + #here we add the ground truth markers without confidence band
   theme(legend.position="bottom") +scale_color_d3()+scale_fill_d3()+ 
  labs(colour = "",fill="", x="Months (from May 2021)",y="Estimate (M +/- 95%CI)")

graph_indiv_pred_nobechmarks
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

hist_long$Domain<-hist_long$domain
hist_long$GT<-"Ground Truth"
hist_long$Month0<-hist_long$Month-1
hist_long$value<-hist_long$Score

dat_longX<-dat_long %>% subset(phase == 1 & !is.na(Method.code)& Method.code!="Ground Truth"& Method.code!="Naive-linear"&Method.code!="Naive-rfw" &Method.code!="Lay People") #to get extra scores for indiv point visualization.
dat_longX$GT<-"Forecasts from\nIndiv. Teams"

dat_long$GT[dat_long$Method.code!="Ground Truth"]<-"Forecasting Estimate"

####HERE IS THE GRAPH OF HISTORICAL< AND SCIENTIST FORECASTS, ALONG WITH LOWESS CURVE OF LAY PEOPLE IN TOURNAMENT 1
FIGURE_2<-dat_long %>% subset(phase == 1 & !is.na(Method.code)& Method.code!="Ground Truth"& Method.code!="Naive-linear"&Method.code!="Naive-rfw" &Method.code!="Lay People") %>% 
   ggplot(aes(x = Month, y = value))+ theme_minimal() +
      geom_line(data=subset(dat_longX,phase==1),aes(x = Month, group=team_name), alpha=.4, colour="#aec7e8", na.rm=TRUE)+ #phase1 in light blue 
  facet_wrap(~domain, scales = "free", nrow = 4, labeller=labeller(domain=labels))+
  geom_line(data=hist_long) +   geom_point(data=hist_long) + #here we add the ground truth markers
  geom_smooth(data=dat_long %>% subset(Method.code=="Lay People"),aes(x = Month, y = value, colour = "salmon", fill="salmon"),method = "loess")+ #lay people
    geom_smooth(aes(x = Month, y = value, colour = "blue", fill="blue"),method = "loess") +  #academics
  scale_x_continuous(breaks=c(-39:12), labels =c("Jan","","","","May","","","","Sep","","","","Jan","","","","May","","","","Sep","","","","Jan","","","","May","","","","Sep","","","","Jan","","","","May","","","","Sep","","","","Jan","","","Apr"))+ 
  guides(fill = "none")+
    scale_color_manual(labels = c("Scientists", "Naive Crowd"), values=c(blue="blue",salmon="salmon"))+
      scale_fill_manual(labels = c("Scientists", "Naive Crowd"), values=c(blue="blue",salmon="salmon"))+
  labs(colour = "",fill="", x="",y="Estimate (M +/- 95%CI)")+
    theme(legend.position="bottom",axis.text.x = element_text(vjust=.5, hjust=1, size=rel(0.7)), panel.grid.minor.x = element_blank(),legend.title = element_blank())

FIGURE_2
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

###############################################################
#graph individual predictions (supplementary, NOT in the paper)
#END
###############################################################

###############################################################
#graph individual predictions and ground truth markers - FIGURE 1 IN THE SUPPLEMENT in the PAPER)
#BEGINNING
##this one includes creating subsets, historical data subsets, and designing sub-plots of various caliber for the paper, and for presentations and putting the subplots together
###############################################################

#combine with the phase 2 data.
dat_long_phase2X<-dat_long %>%filter(!(phase == 1 & revised == 1)& Method.code!="Ground Truth"& Method.code!="Naive-linear"&Method.code!="Naive-rfw" & Month %in% c(7,8,9,10,11,12))

dat_long$GT[dat_long$Method.code!="Ground Truth"]<-"Aggregate Estimate\n(lowess)"
objective$value
##   [1]  6.3336660  6.2174470  6.3044130  6.3270050  6.3362940  6.3384310
##   [7]  6.3313540  6.3001370  6.3488340  6.3472190  6.3302940  6.3399140
##  [13] -1.4074900 -1.6616760 -1.4339360 -1.4248740 -1.4449580 -1.4672450
##  [19] -1.4776450 -1.3675900 -1.4136990 -1.1801190 -1.1777950 -1.1386660
##  [25]  1.5180400  2.2292320  1.9407140  1.9813260  2.0475030  2.1800290
##  [31]  2.2349600  1.8477990  1.9663210  1.4173570  1.3220850  1.1617310
##  [37] 47.4302300 48.2066700 48.2502300 48.9238900 49.2315600 50.1355700
##  [43] 45.8000000 46.0000000 44.4285700 46.0000000 44.2500000 41.1767400
##  [49] 41.2822200 41.4497700 41.8140700 43.1184400 42.7895700 45.6500000
##  [55] 48.0000000 40.8571400 42.0000000 40.5000000 78.0000000 84.5000000
##  [61] 87.0000000 85.0000000 87.5000000 91.0000000 87.0000000 81.0000000
##  [67] 82.5000000 84.0000000 86.0000000 83.0000000  0.3611811  0.3266711
##  [73]  0.3447837  0.3510517  0.3464791  0.3497295  0.3544105  0.3506364
##  [79]  0.3664376  0.3473538  0.3444180  0.3348229 -0.1908142 -0.2467328
##  [85] -0.1923643 -0.1488679 -0.1749712 -0.1724460 -0.1785698 -0.1380645
##  [91] -0.1649681 -0.1558643 -0.1843981 -0.2370465  0.2751396  0.2560401
##  [97]  0.2731962  0.2721085  0.2778844  0.2864830  0.2807626  0.2795000
## [103]  0.2769582  0.2822287  0.2824247  0.2819609 -0.1768523 -0.2204723
## [109] -0.2256391 -0.1952333 -0.2096647 -0.1937306 -0.2038700 -0.1771797
## [115] -0.1864452 -0.2117938 -0.2048987 -0.2168682  0.3469385  0.3537830
## [121]  0.3556234  0.3526524  0.3557363  0.3443210  0.3448016  0.3389993
## [127]  0.3475489  0.3587705  0.3547804  0.3537874  0.8522458  0.8079851
## [133]  0.7954372  0.7906953  0.8033877  0.8229499  0.7808411  0.7461424
## [139]  0.7890630  0.7364855  0.7524200  0.7922046
dat_long$phaseF[dat_long$phase==1]<-"First Tournament\n(May 2020)"
dat_long$phaseF[dat_long$phase==2]<-"Follow-up Tournament\n(Nov 2020)" 

objective$phaseF<-"Ground Truth"
dat_longX<-dat_long %>% subset(!is.na(Method.code)& Method.code!="Ground Truth"& Method.code!="Naive-linear"&Method.code!="Naive-rfw" &Method.code!="Lay People"& Month %in% c(1:12)) 
                               #to get extra scores for indiv point visualization 
dat_longX<-dat_longX %>%subset(!(domain=="lifesat" & value <5.5)) #cut off scores below 5 for life satisfaction for visualization of trends

dat_longX$GT<-"Forecasts from\nIndiv. Teams"

dat_long %>% subset(!is.na(Method.code)& Method.code!="Ground Truth"& Method.code!="Naive-linear"&Method.code!="Naive-rfw" &Method.code!="Lay People"& Month %in% c(1:12)) %>% 
   ggplot(aes(x = Month0, y = value, colour = phaseF, fill=phaseF))+
    geom_line(data=subset(dat_longX,phase==1),aes(x = Month, group=team_name), alpha=.4, color="#aec7e8", na.rm=TRUE)+ #phase1 in light blue 
    geom_line(data=subset(dat_longX,phase==2),aes(x = Month, group=team_name), alpha=.4, color="#ffbb78", na.rm=TRUE)+ #phase1 in light orange 
  geom_smooth(aes(x = Month0, y = value, colour = phaseF, fill=phaseF),method = "loess") +  
  facet_wrap(~domain, scales = "free", nrow = 4, labeller=labeller(domain=labels))+theme_minimal() +
  geom_line(data=objective,alpha=.8,aes(x = Month0, group=team_name), na.rm=TRUE)+  geom_point(data = objective,alpha=.9,aes(x = Month0)) + #here we add the ground truth markers without confidence band
   theme(legend.position="bottom") +scale_color_d3()+scale_fill_d3()+ xlim(0,12)+ scale_x_continuous(breaks=c(0:11), labels =c("May","Jun","Jul","Aug","Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr"))+  labs(colour = "",fill="", x="Months",y="Forecasted & Observed Change")+theme(axis.text.x = element_text(angle=45, vjust=.5, hjust=1, size=rel(0.5)))
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `geom_smooth()` using formula = 'y ~ x'

hist_long$phaseF<-hist_long$GT

#reorder levels of the domains
hist_long$domain <- factor(hist_long$domain,      # Reordering group factor levels
                         levels = c("egend","easian","eafric",
                                    "igend","iasian","iafric",
                                    "posaffect","negaffect","lifesat",
                                    "polar","ideoldem","ideolrep"))

labels<-c(
  eafric = "Exp. Bias Vs. Afr.-Am\nhigher=stereotype-consistent\n(-3 to +3)",
  easian = "Exp. Bias Vs. Asian.-Am\nhigher=stereotype-consistent\n(-3 to +3)",
  egend = "Exp. Bias Vs. Women-Career\nhigher=stereotype-consistent\n(-3 to +3)",
  iafric = "Imp. Bias Vs. Afr.-Am.\nhigher=stereotype-consistent\n(IAT D score)",
  iasian = "Imp. Bias Vs. Asian.-Am.\nhigher=stereotype-consistent\n(IAT D score)",
  ideoldem = "Democratic Support\n(% Population)",
  ideolrep ="Republican Support\n(% Population)",
  igend = "Imp. Bias Vs. Women-Career\nhigher=stereotype-consistent\n(IAT D score)",
  lifesat = "Life Satisfaction\nCantril ladder\n(0-10 scale)",
  negaffect = "Negative Affect\nstandardized Vs. historical M/SD\n(z-score)",
  polar = "Polit. Polarization\n% of Rep. Vs. Dem. approvals\n(absolute difference score) ",
  posaffect = "Positive Affect\n(z-score)")

objective.t<-subset(hist_long,Month>0)
hist.t<-subset(hist_long,Month %in% c(-2:-1))
hist.t$phaseF<-"Historical"

# with 3 historical months before the tournament

graph_hist_months<-hist_long%>% subset(Month %in% c(-2:12))%>%
   ggplot(aes(x = Month, y = value, colour = phaseF, fill=phaseF))+  
  theme_pubclean()+
  geom_line(data=objective.t,alpha=.8,aes(x = Month), na.rm=TRUE)+  geom_point(data=objective.t,alpha=.9,aes(x = Month)) +geom_line(data=hist.t,alpha=.8,aes(x = Month), na.rm=TRUE)+  geom_point(data=hist.t,alpha=.9,aes(x = Month))+   theme(legend.position="bottom") +scale_color_d3()+scale_fill_d3()+ scale_x_continuous(breaks=c(-2:12), labels =c("Feb","","","May","","","Aug","","","Nov","","","Feb","",""))+  labs(colour = "",fill="", x="",y="Forecasted & Observed Change")+facet_wrap(~domain, scales = "free_y", nrow = 4, labeller=labeller(domain=labels))+ 
  geom_line(data=subset(dat_longX,phase==1),aes(x = Month, group=team_name), alpha=.5, color="#aec7e8", na.rm=TRUE)+ #phase1 in light blue 
    geom_line(data=subset(dat_longX,phase==2),aes(x = Month, group=team_name), alpha=.5, color="#ffbb78", na.rm=TRUE)+ #phase1 in light orange 
  theme(axis.text.x = element_text(angle=45, vjust=.5, hjust=1, size=rel(0.8)))+geom_smooth(data=dat_long %>% subset(!is.na(Method.code)& Method.code!="Ground Truth"& Method.code!="Naive-linear"&Method.code!="Naive-rfw" &Method.code!="Lay People"& Month %in% c(-2:12)), aes(x = Month, y = value, colour = phaseF, fill=phaseF),method = "loess") 

graph_hist_months
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# select just negative affect 
plot.negaffect<-hist_long%>% subset(Month %in% c(-2:12) & domain %in% c("negaffect"))%>%
   ggplot(aes(x = Month, y = value, colour = phaseF, fill=phaseF))+  
  theme_pubclean()+
  geom_line(data=subset(objective.t,domain %in% c("negaffect")),aes(x = Month), na.rm=TRUE)+  geom_point(data=subset(objective.t, domain %in% c("negaffect")),aes(x = Month)) +geom_line(data=subset(hist.t,domain %in% c("negaffect")),aes(x = Month), na.rm=TRUE)+  geom_point(data=subset(hist.t,domain %in% c("negaffect")),aes(x = Month))+   theme(legend.position="bottom") +  theme(legend.position="bottom", legend.text = element_text(size=7)) +scale_color_d3()+scale_fill_d3()+ scale_x_continuous(breaks=c(-2:12), labels =c("Feb","","","May","","","Aug","","","Nov","","","Feb","",""))+  labs(colour = "",fill="", x="",y="z-score", title="Negative Affect", subtitle = "Standartized against historical M/SD")+ 
  geom_line(data=subset(dat_longX,phase==1 & domain %in% c("negaffect")),aes(x = Month, group=team_name), alpha=.4, color="#aec7e8", na.rm=TRUE)+ #phase1 in light blue 
    geom_line(data=subset(dat_longX,phase==2 &domain %in% c("negaffect")),aes(x = Month, group=team_name), alpha=.4, color="#ffbb78", na.rm=TRUE)+ #phase1 in light orange 
  theme(axis.text.x = element_text(angle=45, vjust=.5, hjust=1, size=rel(0.8)))+geom_smooth(data=dat_long %>% subset(!is.na(Method.code)& Method.code!="Ground Truth"& Method.code!="Naive-linear"&Method.code!="Naive-rfw" &Method.code!="Lay People"& Month %in% c(-2:12) & domain %in% c("negaffect")), aes(x = Month, y = value, colour = phaseF, fill=phaseF),method = "loess", alpha=.5) +theme(plot.title = element_text(hjust = 0.5),plot.subtitle = element_text(hjust = 0.5))

# select pos affect and life satisfaction

plot.LS.and.posaffect<-hist_long%>% subset(Month %in% c(-2:12)& domain %in% c("posaffect", "lifesat"))%>%
   ggplot(aes(x = Month, y = value, colour = phaseF, fill=phaseF))+  
  theme_pubclean()+
geom_line(data=subset(dat_longX,phase==1 & domain %in% c("posaffect", "lifesat")),aes(x = Month, group=team_name), alpha=.4, color="#aec7e8", na.rm=TRUE)+ #phase1 in light blue 
    geom_line(data=subset(dat_longX,phase==2 &domain %in% c("posaffect", "lifesat")),aes(x = Month, group=team_name), alpha=.4, color="#ffbb78", na.rm=TRUE)+ #phase1 in light orange
  geom_smooth(data=dat_long  %>% subset(!is.na(Method.code)& Method.code!="Ground Truth"& Method.code!="Naive-linear"&Method.code!="Naive-rfw" &Method.code!="Lay People"& Month %in% c(-2:12)& domain %in% c("posaffect", "lifesat")), aes(x = Month, y = value, colour = phaseF, fill=phaseF),method = "loess",alpha=.5)+
  geom_line(data=subset(objective.t,domain %in% c("posaffect", "lifesat")),aes(x = Month), na.rm=TRUE)+  geom_point(data=subset(objective.t, domain %in% c("posaffect", "lifesat")),aes(x = Month)) +geom_line(data=subset(hist.t,domain %in% c("posaffect", "lifesat")),aes(x = Month), na.rm=TRUE)+  geom_point(data=subset(hist.t,domain %in% c("posaffect", "lifesat")),aes(x = Month))+
  theme(legend.position="none") +scale_color_d3()+scale_fill_d3()+ scale_x_continuous(breaks=c(-2:12), labels =c("Feb","","","May","","","Aug","","","Nov","","","Feb","",""))+  labs(colour = "",fill="", x="",y="")+facet_wrap(~domain, scales = "free_y", nrow = 4, labeller=labeller(domain=labels))+ theme(axis.text.x = element_text(angle=45, vjust=.5, hjust=1, size=rel(0.8)))

plot.wb<-ggarrange(plot.negaffect,plot.LS.and.posaffect,  ncol=2, nrow=1,widths=c(2,1))
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
#graph for slides

##graph for paper
# select just negative affect 
plot.negaffectX<-hist_long%>% subset(Month %in% c(-2:12) & domain %in% c("negaffect"))%>%
   ggplot(aes(x = Month, y = value, colour = phaseF, fill=phaseF))+  
  theme_pubclean()+
  geom_line(data=subset(dat_longX,phase==1 & domain %in% c("negaffect")),aes(x = Month, group=team_name), alpha=.4, color="#aec7e8", na.rm=TRUE)+ #phase1 in light blue 
    geom_line(data=subset(dat_longX,phase==2 &domain %in% c("negaffect")),aes(x = Month, group=team_name), alpha=.4, color="#ffbb78", na.rm=TRUE)+ #phase1 in light orange 
  geom_smooth(data=dat_long %>% subset(!is.na(Method.code)& Method.code!="Ground Truth"& Method.code!="Naive-linear"&Method.code!="Naive-rfw" &Method.code!="Lay People"& Month %in% c(-2:12) & domain %in% c("negaffect")), aes(x = Month, y = value, colour = phaseF, fill=phaseF),method = "loess",alpha=.5) +
  geom_line(data=subset(objective.t,domain %in% c("negaffect")),aes(x = Month), na.rm=TRUE)+  geom_point(data=subset(objective.t, domain %in% c("negaffect")),aes(x = Month)) +geom_line(data=subset(hist.t,domain %in% c("negaffect")),aes(x = Month), na.rm=TRUE)+  geom_point(data=subset(hist.t,domain %in% c("negaffect")),aes(x = Month))+
  theme(legend.position="none") +scale_color_d3()+scale_fill_d3()+ scale_x_continuous(breaks=c(-2:12), labels =c("Feb","","","May","","","Aug","","","Nov","","","Feb","",""))+  labs(colour = "",fill="", x="",y="z-score", title="Negative Affect", subtitle = "Standardized against historical M/SD")+ 
  theme(axis.text.x = element_text(angle=45, vjust=.5, hjust=1, size=rel(0.8)))+theme(plot.title = element_text(hjust = 0.5),plot.subtitle = element_text(hjust = 0.5))




plot.wbX<-ggarrange(plot.negaffectX,plot.LS.and.posaffect,  ncol=2, nrow=1,widths=c(1.8,1))
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
#biases and politics
plot.all.but.WB<-hist_long%>% subset(Month %in% c(-2:12)& domain %in% c("egend","easian","eafric",
                                    "igend","iasian","iafric","polar","ideoldem","ideolrep"))%>%
   ggplot(aes(x = Month, y = value, colour = phaseF, fill=phaseF))+  
  theme_pubclean()+
  geom_line(data=subset(dat_longX,phase==1 & domain %in% c("egend","easian","eafric",
                                    "igend","iasian","iafric","polar","ideoldem","ideolrep")),aes(x = Month, group=team_name), alpha=.4, color="#aec7e8", na.rm=TRUE)+ #phase1 in light blue 
    geom_line(data=subset(dat_longX,phase==2 &domain %in% c("egend","easian","eafric",
                                    "igend","iasian","iafric","polar","ideoldem","ideolrep")),aes(x = Month, group=team_name), alpha=.4, color="#ffbb78", na.rm=TRUE)+ #phase1 in light orange 
  geom_smooth(data=dat_long  %>% subset(!is.na(Method.code)& Method.code!="Ground Truth"& Method.code!="Naive-linear"&Method.code!="Naive-rfw" &Method.code!="Lay People"& Month %in% c(-2:12)& domain %in% c("egend","easian","eafric",
                                    "igend","iasian","iafric","polar","ideoldem","ideolrep")), aes(x = Month, y = value, colour = phaseF, fill=phaseF),method = "loess",alpha=.5)+
  geom_line(data=subset(objective.t,domain %in% c("egend","easian","eafric",
                                    "igend","iasian","iafric","polar","ideoldem","ideolrep")),aes(x = Month), na.rm=TRUE)+  geom_point(data=subset(objective.t, domain %in% c("egend","easian","eafric",
                                    "igend","iasian","iafric","polar","ideoldem","ideolrep")),aes(x = Month)) +geom_line(data=subset(hist.t,domain %in% c("egend","easian","eafric",
                                    "igend","iasian","iafric","polar","ideoldem","ideolrep")),aes(x = Month), na.rm=TRUE)+  geom_point(data=subset(hist.t,domain %in% c("egend","easian","eafric",
                                    "igend","iasian","iafric","polar","ideoldem","ideolrep")),aes(x = Month))+
  theme(legend.position="bottom") +scale_color_d3()+scale_fill_d3()+ scale_x_continuous(breaks=c(-2:12), labels =c("Feb","","","May","","","Aug","","","Nov","","","Feb","",""))+  labs(colour = "",fill="", x="",y="")+facet_wrap(~domain, scales = "free_y", nrow = 4, labeller=labeller(domain=labels))+ theme(axis.text.x = element_text(angle=45, vjust=.5, hjust=1, size=rel(0.8)))

#combine into megaplot
### Fig. S1. in the manuscript

plot.all<-ggarrange(plot.wbX,plot.all.but.WB,  ncol=1, nrow=2, heights = c(1.2,1.8),  common.legend = TRUE, legend="bottom")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
plot.all

###############################################################
#graph individual predictions and ground truth markers - IN THE SUPPLEMENT in the PAPER
#END
###############################################################

Visualizations of MASE versus benchmarks

###############################################################
#graph individual predictions and ground truth markers - FIGURE 2 IN THE SUPPLEMENT in the PAPER, as well as one FIGURE in the MAIN TEXT)
#ALSO: analyses of scientists versus lay people in tournament 1
#BEGINNING
##this one includes creating model estimates for tournament 1 and tournament 2 for academics (and lay people in tournament 1 - we focus on linear mixed model estimates to account for interdependence in predictions), saving mean estimates and CIs, combining with benchmarks, and designing plots of various caliber for the paper
###############################################################
pd <- position_dodge(0.7) # move them .07 to the left and right

##by method for phase 1
###inspect data for distribution properties

hist(log(phase1$MASE1_w1)) #possibly do it on logs?

describe(phase1$MASE1_w1)
## phase1$MASE1_w1 
##        n  missing distinct     Info     Mean  pMedian      Gmd      .05 
##     1850       12     1756        1    6.134     3.91    6.902   0.9201 
##      .10      .25      .50      .75      .90      .95 
##   1.1955   1.8012   3.4322   5.8612   9.8178  16.3946 
## 
## lowest : 0.295347 0.328552 0.369158 0.39811  0.432914
## highest: 174.196  179.452  187.59   199.385  244.305
#analyses of phase 1  - MASE overall, without an interaction
model.phase1.together.nodomain.interac<-  lmer(log(MASE1_w1)~isExpert.factor+domain+(1|ResponseId), data=phase1)
car::Anova(model.phase1.together.nodomain.interac,type="III", test.statistic="F") #sig main effect, but only if we don't include interaction. 
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: log(MASE1_w1)
##                       F Df  Df.res                Pr(>F)    
## (Intercept)     248.151  1 1051.66 < 0.00000000000000022 ***
## isExpert.factor  17.667  1  628.37            0.00003014 ***
## domain           81.936 11 1572.37 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summ(model.phase1.together.nodomain.interac, digits = 8)
Observations 1826
Dependent variable log(MASE1_w1)
Type Mixed effects linear regression
AIC 4148.36143420
BIC 4231.00968012
Pseudo-R² (fixed effects) 0.28628897
Pseudo-R² (total) 0.60430920
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 1.47502745 0.09358737 15.76096651 880.78281180 0.00000000
isExpert.factorProlific 0.31898754 0.07586433 4.20471041 482.67098452 0.00003116
domaineasian -0.18587961 0.08613383 -2.15803261 1811.08366472 0.03105638
domainegend -1.22881293 0.08590341 -14.30458788 1812.79819099 0.00000000
domainiafric 0.20678695 0.07571346 2.73117821 1092.14529567 0.00641248
domainiasian -0.72575702 0.08576178 -8.46247585 1807.87422190 0.00000000
domainideoldem -0.41555537 0.08294878 -5.00978262 1812.96458414 0.00000060
domainideolrep 0.16065977 0.08285865 1.93896191 1812.99186651 0.05266097
domainigend -0.96913414 0.08590733 -11.28115740 1812.98194788 0.00000000
domainlifesat 0.05830001 0.08475695 0.68784930 1795.91370313 0.49163652
domainnegaffect -0.38151899 0.08190778 -4.65790893 1812.80024277 0.00000343
domainpolar -0.73440997 0.08648695 -8.49156929 1797.67464735 0.00000000
domainposaffect -1.20124279 0.08173868 -14.69613706 1812.92195389 0.00000000
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
ResponseId (Intercept) 0.53115840
Residual 0.59248130
Grouping Variables
Group # groups ICC
ResponseId 888 0.44558682
partR2(model.phase1.together.nodomain.interac,partvars=c("isExpert.factor","domain"), nboot=100, parallel=T)
## 
## 
## R2 (marginal) and 95% CI for the full model: 
##  R2     CI_lower CI_upper nboot ndf
##  0.2863 0.2664   0.3193   100   13 
## 
## ----------
## 
## Part (semi-partial) R2:
##  Predictor(s)           R2     CI_lower CI_upper nboot ndf
##  Model                  0.2863 0.2664   0.3193   100   13 
##  isExpert.factor        0.0132 0.0000   0.0498   100   12 
##  domain                 0.2785 0.2588   0.3118   100    2 
##  isExpert.factor+domain 0.2863 0.2664   0.3193   100    1
#Pseudo-R² (fixed effects) =  0.28628897


model.phase1.together<-  lmer(log(MASE1_w1)~domain*isExpert.factor+(1|ResponseId), data=phase1)
car::Anova(model.phase1.together,type="III", test.statistic="F") #sig interaction!
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: log(MASE1_w1)
##                               F Df Df.res               Pr(>F)    
## (Intercept)            118.3417  1 1645.8 < 0.0000000000000002 ***
## domain                  24.9028 11 1064.2 < 0.0000000000000002 ***
## isExpert.factor          0.8810  1 1747.0              0.34805    
## domain:isExpert.factor   1.9951 11 1304.0              0.02566 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
data.phase1.MASE.together<-as.data.frame(emmeans(model.phase1.together, pairwise~domain*isExpert.factor, adjust = "none", type = "response")$emmeans %>% rbind(adjust=T)) #backtransformed to the original scale and adjusted for simulatenous inference
plot(model.phase1.together) #residuals show adequate transformation ( compared to non-transformed and sqroot transformations)

summ(model.phase1.together, digits = 8)
Observations 1826
Dependent variable log(MASE1_w1)
Type Mixed effects linear regression
AIC 4170.07746907
BIC 4313.33442866
Pseudo-R² (fixed effects) 0.28053205
Pseudo-R² (total) 0.60819241
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 1.59866729 0.14691650 10.88146888 1584.40109504 0.00000000
domaineasian -0.30243777 0.17461164 -1.73205959 831.50746601 0.08363380
domainegend -1.50193396 0.18234704 -8.23667878 816.78404083 0.00000000
domainiafric 0.25355978 0.17470656 1.45134663 817.63984391 0.14706692
domainiasian -0.76691735 0.16988428 -4.51435139 843.46708452 0.00000725
domainideoldem -0.44717744 0.16881110 -2.64898131 874.99959714 0.00821899
domainideolrep 0.10736930 0.16881110 0.63603224 874.99959726 0.52492173
domainigend -1.22643604 0.18046643 -6.79592338 817.01544425 0.00000000
domainlifesat -0.20365691 0.15865095 -1.28367906 991.09101911 0.19955431
domainnegaffect -0.19913131 0.17224400 -1.15610013 834.00226264 0.24797127
domainpolar -0.92686152 0.17091229 -5.42302439 883.69004387 0.00000008
domainposaffect -1.31752331 0.17224400 -7.64916800 834.00226246 0.00000000
isExpert.factorProlific 0.15232944 0.16224955 0.93885896 1723.37784434 0.34793472
domaineasian:isExpert.factorProlific 0.15851929 0.20133390 0.78734528 1215.12307216 0.43123332
domainegend:isExpert.factorProlific 0.37756226 0.20709589 1.82312774 1156.76331311 0.06854204
domainiafric:isExpert.factorProlific -0.06726934 0.19361509 -0.34743854 871.54256353 0.72834577
domainiasian:isExpert.factorProlific 0.06169366 0.19755666 0.31228335 1246.84902524 0.75487741
domainideoldem:isExpert.factorProlific 0.05363104 0.19447323 0.27577597 1267.23179121 0.78276511
domainideolrep:isExpert.factorProlific 0.08088417 0.19440699 0.41605587 1267.25721123 0.67743957
domainigend:isExpert.factorProlific 0.36388700 0.20561110 1.76978284 1164.06270536 0.07702507
domainlifesat:isExpert.factorProlific 0.39494835 0.18939213 2.08534724 1394.94214271 0.03721920
domainnegaffect:isExpert.factorProlific -0.19255930 0.19649575 -0.97996676 1202.26563521 0.32729968
domainpolar:isExpert.factorProlific 0.26315832 0.19882045 1.32359787 1261.94151557 0.18587636
domainposaffect:isExpert.factorProlific 0.16337227 0.19638770 0.83188647 1202.24770522 0.40563823
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
ResponseId (Intercept) 0.53714810
Residual 0.58737910
Grouping Variables
Group # groups ICC
ResponseId 888 0.45542037
#Pseudo-R² (fixed effects) = 0.28053205

partR2(model.phase1.together,partvars=c("isExpert.factor","isExpert.factor:domain"), nboot=100, parallel=T)
## 
## 
## R2 (marginal) and 95% CI for the full model: 
##  R2     CI_lower CI_upper nboot ndf
##  0.2805 0.261    0.3171   100   24 
## 
## ----------
## 
## Part (semi-partial) R2:
##  Predictor(s)                           R2     CI_lower CI_upper nboot ndf
##  Model                                  0.2805 0.261    0.3171   100   24 
##  isExpert.factor                        0.0000 0.000    0.0494   100   24 
##  isExpert.factor:domain                 0.0000 0.000    0.0422   100   13 
##  isExpert.factor+isExpert.factor:domain 0.0054 0.000    0.0543   100   12
#test the difference between experts and lay people
model.layVSsci.phase1<-  lmer(log(MASE1_w1)~domain*isExpert.factor+(1|ResponseId), data=phase1)
car::Anova(model.layVSsci.phase1,type="III",test.statistic="F") #sig interaction!
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: log(MASE1_w1)
##                               F Df Df.res               Pr(>F)    
## (Intercept)            118.3417  1 1645.8 < 0.0000000000000002 ***
## domain                  24.9028 11 1064.2 < 0.0000000000000002 ***
## isExpert.factor          0.8810  1 1747.0              0.34805    
## domain:isExpert.factor   1.9951 11 1304.0              0.02566 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
data.phase1.MASE.explaycomp<-as.data.frame(emmeans(model.layVSsci.phase1,revpairwise ~isExpert.factor|domain,  type="response")$contrasts%>% rbind(adjust="fdr")) #get the estimates in a dataframe

emmeans(model.layVSsci.phase1,revpairwise ~isExpert.factor|domain,  type="response")$contrasts%>%
     rbind(adjust="fdr") #get FDR correction across all pairwise tests
##  domain    contrast            ratio    SE   df null t.ratio p.value
##  eafric    Prolific / Academic 1.165 0.189 1747    1   0.939  0.4177
##  easian    Prolific / Academic 1.365 0.212 1789    1   1.998  0.0917
##  egend     Prolific / Academic 1.699 0.281 1731    1   3.204  0.0064
##  iafric    Prolific / Academic 1.089 0.173 1780    1   0.536  0.6459
##  iasian    Prolific / Academic 1.239 0.185 1802    1   1.430  0.2037
##  ideoldem  Prolific / Academic 1.229 0.174 1795    1   1.456  0.2037
##  ideolrep  Prolific / Academic 1.263 0.179 1794    1   1.650  0.1701
##  igend     Prolific / Academic 1.676 0.274 1747    1   3.161  0.0064
##  lifesat   Prolific / Academic 1.729 0.219 1725    1   4.321  0.0002
##  negaffect Prolific / Academic 0.961 0.143 1796    1  -0.271  0.7865
##  polar     Prolific / Academic 1.515 0.223 1802    1   2.819  0.0146
##  posaffect Prolific / Academic 1.371 0.203 1796    1   2.128  0.0804
## 
## Degrees-of-freedom method: kenward-roger 
## P value adjustment: fdr method for 12 tests 
## Tests are performed on the log scale
eff_size(emmeans(model.layVSsci.phase1,revpairwise ~isExpert.factor|domain,  type="response"),sigma = sigma(model.layVSsci.phase1), edf =df.residual(model.layVSsci.phase1)) %>% rbind(adjust="mvt") #using the smallest DF among the three #adjusting for muliple testing via mvt (which is less conservative and more "exact' compared to Bonferroni)
## Since 'object' is a list, we are using the contrasts already present.
##  domain    contrast              estimate    SE   df lower.CL upper.CL
##  eafric    (Prolific - Academic)   0.2593 0.276 1747 -0.52767    1.046
##  easian    (Prolific - Academic)   0.5292 0.265 1789 -0.22551    1.284
##  egend     (Prolific - Academic)   0.9021 0.282 1731  0.09896    1.705
##  iafric    (Prolific - Academic)   0.1448 0.270 1780 -0.62467    0.914
##  iasian    (Prolific - Academic)   0.3644 0.255 1802 -0.36137    1.090
##  ideoldem  (Prolific - Academic)   0.3506 0.241 1795 -0.33548    1.037
##  ideolrep  (Prolific - Academic)   0.3970 0.241 1794 -0.28871    1.083
##  igend     (Prolific - Academic)   0.8788 0.278 1747  0.08591    1.672
##  lifesat   (Prolific - Academic)   0.9317 0.216 1725  0.31606    1.547
##  negaffect (Prolific - Academic)  -0.0685 0.253 1796 -0.78866    0.652
##  polar     (Prolific - Academic)   0.7074 0.251 1802 -0.00795    1.423
##  posaffect (Prolific - Academic)   0.5375 0.253 1796 -0.18243    1.257
## 
## Degrees-of-freedom method: inherited from kenward-roger when re-gridding 
## Confidence level used: 0.95 
## Conf-level adjustment: mvt method for 12 estimates
#test Bayesian version of full model and estimated simple effects.
phase1$MASE1_w1_log<-log(phase1$MASE1_w1)
stan_model <- stan_lmer(MASE1_w1_log ~ domain*isExpert.factor + (1|ResponseId), data = phase1,
                        #prior = cauchy(0, 0.707),          # as per Rouder et al., 2012 |used before - changed to autoscale
                        prior_intercept = student_t(3,0,10),          # weakly informative
                        prior_aux = exponential(.1),                  # weakly informative
                        prior_covariance = decov(1,1,1,1))            # weakly informative
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.001134 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 11.34 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 75.226 seconds (Warm-up)
## Chain 1:                17.913 seconds (Sampling)
## Chain 1:                93.139 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 0.000235 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 2.35 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 2: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 2: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 2: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 2: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 50.717 seconds (Warm-up)
## Chain 2:                16.816 seconds (Sampling)
## Chain 2:                67.533 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 0.000398 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 3.98 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 3: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 3: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 3: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 3: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 56.15 seconds (Warm-up)
## Chain 3:                14.91 seconds (Sampling)
## Chain 3:                71.06 seconds (Total)
## Chain 3: 
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 4).
## Chain 4: 
## Chain 4: Gradient evaluation took 0.000178 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 1.78 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4: 
## Chain 4: 
## Chain 4: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 4: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 4: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 4: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 4: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 4: 
## Chain 4:  Elapsed Time: 53.247 seconds (Warm-up)
## Chain 4:                18.556 seconds (Sampling)
## Chain 4:                71.803 seconds (Total)
## Chain 4:
em_expert_simple <- emmeans(stan_model, revpairwise~isExpert.factor | domain,  type="response")

bayesfactor_parameters(em_expert_simple, prior = stan_model)
## Sampling priors, please wait...
## Warning: Bayes factors might not be precise.
##   For precise Bayes factors, sampling at least 40,000 posterior samples is
##   recommended.
## Bayes Factor (Savage-Dickey density ratio)
## 
## domain    | isExpert.factor |            contrast |       BF
## ------------------------------------------------------------
## eafric    |        Academic |                     | 2.35e+08
## eafric    |        Prolific |                     | 1.23e+23
## easian    |        Academic |                     | 2.62e+06
## easian    |        Prolific |                     | 2.48e+18
## egend     |        Academic |                     |    0.013
## egend     |        Prolific |                     | 2.47e+05
## iafric    |        Academic |                     | 8.65e+13
## iafric    |        Prolific |                     | 2.15e+26
## iasian    |        Academic |                     |   675.15
## iasian    |        Prolific |                     | 3.56e+11
## ideoldem  |        Academic |                     | 9.90e+06
## ideoldem  |        Prolific |                     | 5.09e+16
## ideolrep  |        Academic |                     | 5.60e+14
## ideolrep  |        Prolific |                     | 6.92e+25
## igend     |        Academic |                     |    0.229
## igend     |        Prolific |                     | 1.12e+09
## lifesat   |        Academic |                     | 4.66e+10
## lifesat   |        Prolific |                     | 3.38e+23
## negaffect |        Academic |                     | 5.62e+07
## negaffect |        Prolific |                     | 6.24e+15
## polar     |        Academic |                     |   157.50
## polar     |        Prolific |                     | 4.10e+10
## posaffect |        Academic |                     |    0.085
## posaffect |        Prolific |                     | 1.09e+08
## eafric    |                 | Prolific - Academic |    0.042
## easian    |                 | Prolific - Academic |    0.104
## egend     |                 | Prolific - Academic |     1.75
## iafric    |                 | Prolific - Academic |    0.016
## iasian    |                 | Prolific - Academic |    0.038
## ideoldem  |                 | Prolific - Academic |    0.036
## ideolrep  |                 | Prolific - Academic |    0.055
## igend     |                 | Prolific - Academic |     1.76
## lifesat   |                 | Prolific - Academic |    10.62
## negaffect |                 | Prolific - Academic |    0.016
## polar     |                 | Prolific - Academic |    0.485
## posaffect |                 | Prolific - Academic |    0.165
## 
## * Evidence Against The Null: 0
#phase 2
dat_phase2<-academic_only %>%filter(!(phase == 1 & revised == 1)) #just academics, omitting original (non-revised phase 1)
model.phase2.together<-  lmer(log(MASE1_w2)~domain+(1|team_name), data=dat_phase2)
car::Anova(model.phase2.together,type="III") #sig interaction!
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: log(MASE1_w2)
##               Chisq Df            Pr(>Chisq)    
## (Intercept)  18.588  1            0.00001623 ***
## domain      296.166 11 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
data.phase2.MASE.together<-as.data.frame(emmeans(model.phase2.together, pairwise~domain, type = "response")$emmeans %>%rbind(adjust="mvt")) #backtransformed to the original scale #adjust for MVT

data.phase1.MASE.together$Wave<-"First Tournament (May 2020)"
data.phase1.MASE.together$Type[data.phase1.MASE.together$isExpert.factor=="Academic"]<-"Scientists"
data.phase1.MASE.together$Type[data.phase1.MASE.together$isExpert.factor=="Prolific"]<-"Naive Crowd"


data.phase2.MASE.together$Wave<-"Second Tournament (Nov 2020)"
data.phase2.MASE.together$Type<-"Scientists"

#add simulation benchmarks & combine
means.compare.to.naive<-bind_rows(data.phase1.MASE.together,data.phase2.MASE.together,sim.w1,sim.w2)

#arrange in descending order based on MASE w1 of academics
means.compare.to.naive$domain<-factor(means.compare.to.naive$domain,levels=c("iafric","ideolrep","eafric",
  "negaffect", "lifesat","easian","ideoldem","iasian", "polar", "igend","posaffect","egend"))

#arrange in order of tournament factors
means.compare.to.naive$Wave<-factor(means.compare.to.naive$Wave,levels=c("First Tournament (May 2020)","Second Tournament (Nov 2020)"))

#arrange groups
means.compare.to.naive$Type<-factor(means.compare.to.naive$Type,levels=c("Scientists","Naive Crowd","Historic Mean","Random Walk","Linear Regression"))

#add var for Scientists vs. rest (to define colors)
means.compare.to.naive$Group[means.compare.to.naive$Type=="Scientists"]<-"Estimate"
means.compare.to.naive$Group[means.compare.to.naive$Type!="Scientists"]<-"Non Estimate"

labeling<-c(
  eafric = "Exp. Afr.-Am. Bias",
  easian = "Exp. Asian-Am. Bias",
  egend = "Exp. Gender Bias",
  iafric = "Imp. Afr.-Am. Bias",
  iasian = "Imp. Asian-Am. Bias",
  ideoldem = "Democrat. Support",
  ideolrep ="Republic. Support",
  igend = "Imp. Gender Bias",
  lifesat = "Life Satisfaction",
  negaffect = "Negative Affect",
  polar = "Polarization",
  posaffect = "Positive Affect")

#plot for the supplement
#### Fig. S2. in the manuscript
means.compare.to.naive %>%  
 ggplot(aes(x = response, y = domain, color = Type, shape=Type))+
 geom_pointrange(aes(xmin=lower.CL, xmax=upper.CL), position=pd)+  theme_minimal(base_size = 14)+geom_vline(xintercept =1, linetype='dotted', color='black',14)+
  geom_vline(xintercept =1.7665, linetype='dashed', color='blue',16)+theme(legend.position="bottom")+scale_color_jama()+  labs(x="Forecasting Error - MASE (M +/- 95%CI)",shape="",color="")+scale_y_discrete(labels=labeling, name="")+facet_grid(~Wave)
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.

#create a main text version with top (lowest MASE) benchmark per domain instead of all three benchmarks
##first, get lowest benchmarks per domain
sim.w1.top<-sim.w1 %>% dplyr::select(domain,Mean) %>% summarise(response = min(Mean), Wave="First Tournament (May 2020)")
sim.w2.top<-sim.w2 %>% dplyr::select(domain,Mean) %>% summarise(response = min(Mean), Wave="Second Tournament (Nov 2020)")
#add simulation benchmarks & combine
means.compare.to.naive.top<-bind_rows(data.phase1.MASE.together,data.phase2.MASE.together,sim.w1.top,sim.w2.top)
#arrange in descending order based on MASE w1 of academics
means.compare.to.naive.top$domain<-factor(means.compare.to.naive.top$domain,levels=c("iafric","ideolrep","eafric",
  "negaffect", "lifesat","easian","ideoldem","iasian", "polar", "igend","posaffect","egend"))
#arrange in order of tournament factors
means.compare.to.naive.top$Wave<-factor(means.compare.to.naive.top$Wave,levels=c("First Tournament (May 2020)","Second Tournament (Nov 2020)"))
#arrange groups
means.compare.to.naive.top$Type[is.na(means.compare.to.naive.top$Type)==T]<-"Naive Statistic"
means.compare.to.naive.top$Type<-factor(means.compare.to.naive.top$Type,levels=c("Scientists","Naive Crowd","Naive Statistic"))
#add var for Scientists vs. rest (to define colors)
means.compare.to.naive.top$Group[means.compare.to.naive.top$Type=="Scientists"]<-"Estimate"
means.compare.to.naive.top$Group[means.compare.to.naive.top$Type!="Scientists"]<-"Non Estimate"

#plot for the main text
#### Figure 1. in the manuscript

means.compare.to.naive.top %>%  
 ggplot(aes(x = response, y = domain, color = Type, shape=Type))+
 geom_pointrange(aes(xmin=lower.CL, xmax=upper.CL), position=pd)+  theme_minimal(base_size = 14)+geom_vline(xintercept =1, linetype='dotted', color='black',14)+
  geom_vline(xintercept =1.7665, linetype='dashed', color='blue',16)+theme(legend.position="bottom")+scale_color_jama()+  labs(x="Forecasting Error - MASE (M +/- 95%CI)",shape="",color="")+scale_y_discrete(labels=labeling, name="")+facet_grid(~Wave)
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.
## `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_segment()`).
## Removed 12 rows containing missing values or values outside the scale range
## (`geom_segment()`).

#create an alternative version with a mean of MASE of benchmarks per domain instead of all three benchmarks
##first, get lowest benchmarks per domain
sim.w1.mean<-sim.w1 %>% dplyr::select(domain,Mean) %>% summarise(response = mean(Mean), Wave="First Tournament (May 2020)")
sim.w2.mean<-sim.w2 %>% dplyr::select(domain,Mean) %>% summarise(response = mean(Mean), Wave="Second Tournament (Nov 2020)")
#add simulation benchmarks & combine
means.compare.to.naive.mean<-bind_rows(data.phase1.MASE.together,data.phase2.MASE.together,sim.w1.mean,sim.w2.mean)
#arrange in descending order based on MASE w1 of academics
means.compare.to.naive.mean$domain<-factor(means.compare.to.naive.mean$domain,levels=c("iafric","ideolrep","eafric",
  "negaffect", "lifesat","easian","ideoldem","iasian", "polar", "igend","posaffect","egend"))
#arrange in order of tournament factors
means.compare.to.naive.mean$Wave<-factor(means.compare.to.naive.mean$Wave,levels=c("First Tournament (May 2020)","Second Tournament (Nov 2020)"))
#arrange groups
means.compare.to.naive.mean$Type[is.na(means.compare.to.naive.mean$Type)==T]<-"Average of Naive Statistics"
means.compare.to.naive.mean$Type<-factor(means.compare.to.naive.mean$Type,levels=c("Scientists","Naive Crowd","Average of Naive Statistics"))
#add var for Scientists vs. rest (to define colors)
means.compare.to.naive.mean$Group[means.compare.to.naive.mean$Type=="Scientists"]<-"Estimate"
means.compare.to.naive.mean$Group[means.compare.to.naive.mean$Type!="Scientists"]<-"Non Estimate"

##ADDITIONAL PLOT TO EXAMINE DIFFERENCES OF SCIENTIST FORECASTS AGAINST THE AVERAGE OF NAIVE STATISTICS
means.compare.to.naive.mean %>%  
 ggplot(aes(x = response, y = domain, color = Type, shape=Type))+
 geom_pointrange(aes(xmin=lower.CL, xmax=upper.CL), position=pd)+  theme_minimal(base_size = 14)+geom_vline(xintercept =1, linetype='dotted', color='black',14)+
  geom_vline(xintercept =1.7665, linetype='dashed', color='blue',16)+theme(legend.position="bottom")+scale_color_jama()+  labs(x="Forecasting Error - MASE (M +/- 95%CI)",shape="",color="")+scale_y_discrete(labels=labeling, name="")+facet_grid(~Wave)
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_segment()`).
## Removed 12 rows containing missing values or values outside the scale range
## (`geom_segment()`).

##

#subplots for presentation (talks, etc)

#scientists
means.compare.to.naive %>%  subset(Type=="Scientists")%>%  
 ggplot(aes(x = response, y = domain, color = Type, shape=Type))+
 geom_pointrange(aes(xmin=lower.CL, xmax=upper.CL), position=pd)+  theme_minimal(base_size = 14)+geom_vline(xintercept =1, linetype='dotted', color='black',14)+
  geom_vline(xintercept =1.7665, linetype='dashed', color='blue',16)+theme(legend.position="bottom")+scale_color_jama()+  labs(x="Forecasting Error - MASE (M +/- 95%CI)",shape="",color="")+scale_y_discrete(labels=labeling, name="")+facet_grid(~Wave)
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.

#scientists & lay crowd
means.compare.to.naive %>%  subset(Type=="Scientists"|Type=="Naive Crowd")%>%  
 ggplot(aes(x = response, y = domain, color = Type, shape=Type))+
 geom_pointrange(aes(xmin=lower.CL, xmax=upper.CL), position=pd)+  theme_minimal(base_size = 14)+geom_vline(xintercept =1, linetype='dotted', color='black',14)+
  geom_vline(xintercept =1.7665, linetype='dashed', color='blue',16)+theme(legend.position="bottom")+scale_color_jama()+  labs(x="Forecasting Error - MASE (M +/- 95%CI)",shape="",color="")+scale_y_discrete(labels=labeling, name="")+facet_grid(~Wave)
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.
## `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.

###############################################################
#graph individual predictions and ground truth markers - FIGURE 2 IN THE SUPPLEMENT in the PAPER, as well as one FIGURE in the MAIN TEXT)
#ALSO: analyses of scientists versus lay people in tournament 1
#END

#add rainclouds to inspect distributions of responses to see if there are outliers!
phase1$domain<-factor(phase1$domain,levels=c("iafric","ideolrep","eafric",
  "negaffect", "lifesat","easian","ideoldem","iasian", "polar", "igend","posaffect","egend")) #order by mean accuracy in T1

#### tournament 1 data
phase1 %>%
  ggplot(aes(x = domain, y = MASE1_w1,fill=domain))+ 
 # stat_slab(side = "left", scale = 0.5, position = "dodge") +
  stat_dotsinterval(quantiles = 100, position = "dodge") +
 scale_x_discrete(labels=labeling)+scale_fill_tq()+
  theme_minimal()+
  labs(title = "First Tournament (May 2020)",
       y = "MASE",
      x = "")+
  coord_flip()+
    theme(legend.position = "none")+ylim(0,50)+
  geom_boxplot(
    width = .12,
    ## remove outliers
    alpha = 0.5
      )+ geom_point(data=subset(data.phase1.MASE.together, isExpert.factor %in% c("Academic")),alpha=.9,aes(y = response), size=3.5, shape=7, colour="red") 
## Warning: Removed 26 rows containing missing values or values outside the scale range
## (`stat_slabinterval()`).
## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

dat_phase2$domain<-factor(dat_phase2$domain,levels=c("iafric","ideolrep","eafric",
  "negaffect", "lifesat","easian","ideoldem","iasian", "polar", "igend","posaffect","egend")) #order by mean accuracy in T1

#### tournament 1 data

dat_phase2 %>%
  ggplot(aes(x = domain, y = MASE1_w2,fill=domain))+ 
  stat_dotsinterval( quantiles = 100, position = "dodge") +
 scale_fill_tq()+scale_x_discrete(labels=labeling)+
  theme_minimal()+
  labs(title = "Second Tournament (Nov 2020)",
       y = "MASE",
      x = "")+
  coord_flip()+
    theme(legend.position = "none")+ylim(0,40)+
  geom_boxplot(
    width = .12,
    ## remove outliers
    #outlier.color = NA,
    alpha = 0.5
      )+ geom_point(data=data.phase2.MASE.together,alpha=.9,aes(y = response), size=3.5, shape=7, colour="red") 

#statistical tests of difference from benchmark

########################################################################################
#THIS SECTION INCLUDES STATISTICAL TESTS AGAINST THE BENCHMARKS AND THEIR VISUALIZATION
########################################################################################
#to examine difference in inaccuracy from benchmark vs. domain estimates from scientists in the LME, we can do the following:

#1. create ratio of  benchmark inaccuracy to forecasting inaccuracy -  score above 1 means forecast is more accurate compared to the benchmark
#2 run an intercept model, to see if intercept is sig different from 1

##Tournament 1 - phase1_exp

phase1_exp_wbench<-phase1_exp %>% left_join(pivot_wider(sim.w1 %>% dplyr::select(domain,response, source),
                                            names_from="source",values_from="response"))
## Joining with `by = join_by(domain)`
phase1_exp_wbench$MASE_ratio1<- phase1_exp_wbench$'Benchmark 1'/phase1_exp_wbench$MASE1_w1
phase1_exp_wbench$MASE_ratio2<- phase1_exp_wbench$'Benchmark 2'/phase1_exp_wbench$MASE1_w1
phase1_exp_wbench$MASE_ratio3<- phase1_exp_wbench$'Benchmark 3'/phase1_exp_wbench$MASE1_w1

phase1_exp_wbench$domain <- factor(phase1_exp_wbench$domain,      # Reordering group factor levels
                         levels = c("ideolrep","ideoldem","polar",
                                    "lifesat","negaffect","posaffect",
                                    "iafric","iasian","igend",
                                    "eafric","easian","egend" ))

###Test performance against all three benchmarks in a linear mixed model - ratio scores nested in participants

phase1_exp_wbench_long<-phase1_exp_wbench %>% pivot_longer(MASE_ratio1:MASE_ratio3,names_to="Benchmark",values_to="scores")
#use logs due to proportion scores
model.phase1.all.ratio<-  lmer(log(scores)~domain*Benchmark+(1|team_name), data=phase1_exp_wbench_long)

Anova(model.phase1.all.ratio, type=3)
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: log(scores)
##                     Chisq Df            Pr(>Chisq)    
## (Intercept)        0.6362  1              0.425094    
## domain           155.0087 11 < 0.00000000000000022 ***
## Benchmark         13.7257  2              0.001046 ** 
## domain:Benchmark 400.2861 22 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(model.phase1.all.ratio,~domain, type="response") %>% rbind(adjust="mvt")
## NOTE: Results may be misleading due to involvement in interactions
##  domain    response     SE  df lower.CL upper.CL
##  ideolrep     0.909 0.0716 154    0.736     1.12
##  ideoldem     0.959 0.0755 154    0.776     1.18
##  polar        1.318 0.1050 159    1.064     1.63
##  lifesat      1.184 0.0863 115    0.973     1.44
##  negaffect    0.874 0.0706 170    0.703     1.09
##  posaffect    1.050 0.0848 170    0.845     1.30
##  iafric       0.828 0.0683 184    0.663     1.03
##  iasian       0.983 0.0788 164    0.793     1.22
##  igend        1.460 0.1230 199    1.164     1.83
##  eafric       0.860 0.0723 198    0.686     1.08
##  easian       1.143 0.0935 178    0.917     1.42
##  egend        1.550 0.1320 205    1.234     1.95
## 
## Results are averaged over some or all of the levels of: Benchmark 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Conf-level adjustment: mvt method for 12 estimates 
## Intervals are back-transformed from the log scale
#nine out of 12 domains show no significant difference from zero (includes 1)

plot.t1.avBenchmark<-plot(emmeans(model.phase1.all.ratio,~domain, type="response")%>% rbind(adjust="mvt"),comparisons=F, color="black")+scale_y_discrete(labels=labeling, name="Domain")+geom_vline(xintercept =1, linetype='dashed', color='black',14)+theme_minimal()+  labs(x="",shape="",color="", title="Tournament 1 (May 2020)")
## NOTE: Results may be misleading due to involvement in interactions
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.
#9 domains with no difference


###Test individual markers

#skewness test suggests that sqrt is the most reasonable transformation across the three metrics (esp. the first one) hence we will use it.
model.phase1.hist.mean.ratio<-  lmer(sqrt(MASE_ratio1)~domain+(1|team_name), data=phase1_exp_wbench)
Anova(model.phase1.hist.mean.ratio, test.statistic = "F")
## Analysis of Deviance Table (Type II Wald F tests with Kenward-Roger df)
## 
## Response: sqrt(MASE_ratio1)
##             F Df Df.res                Pr(>F)    
## domain 10.295 11 320.03 0.0000000000000003915 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
phase1.hist.mean.ratio.means <- as.data.frame(emmeans(model.phase1.hist.mean.ratio,~domain, type="response")%>% rbind(adjust="mvt"))
phase1.hist.mean.ratio.means$Estimate<-"Historical Mean"
phase1.hist.mean.ratio.means$Date<-"Tournament 1 (May 2020)"
phase1.hist.mean.ratio.means
##  domain     response         SE     df  lower.CL upper.CL Estimate       
##  ideolrep  1.3273637 0.09922746 345.27 1.0573899 1.627999 Historical Mean
##  ideoldem  1.4641364 0.10421439 345.27 1.1798214 1.779114 Historical Mean
##  polar     1.6657083 0.11450847 346.03 1.3527381 2.011217 Historical Mean
##  lifesat   0.8004701 0.05957967 341.56 0.6383166 0.980957 Historical Mean
##  negaffect 1.1477488 0.10159925 346.57 0.8742164 1.458456 Historical Mean
##  posaffect 1.1485555 0.10163495 346.57 0.8749209 1.459364 Historical Mean
##  iafric    0.9847649 0.09955364 346.88 0.7193317 1.291796 Historical Mean
##  iasian    0.7767266 0.08076865 346.21 0.5618577 1.026311 Historical Mean
##  igend     1.1436895 0.11428273 346.98 0.8387078 1.495872 Historical Mean
##  eafric    1.2949213 0.12160876 346.97 0.9688753 1.668172 Historical Mean
##  easian    0.7860403 0.08720597 346.91 0.5553034 1.056766 Historical Mean
##  egend     1.5248937 0.13501181 346.86 1.1614167 1.937779 Historical Mean
##  Date                   
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Conf-level adjustment: mvt method for 12 estimates 
## Intervals are back-transformed from the sqrt scale
#here are the results of the historical mean tests for Tournament 1

plot.t1.hist.mean<-plot(emmeans(model.phase1.hist.mean.ratio,~domain, type="response")%>% rbind(adjust="mvt"),comparisons=F, color="black")+scale_y_discrete(labels=labeling, name="Historical Mean")+geom_vline(xintercept =1, linetype='dashed', color='black',14)+theme_minimal()+  labs(x="",shape="",color="", title="Tournament 1 (May 2020)")
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.
model.phase1.randwalk.ratio<-  lmer(sqrt(MASE_ratio2)~domain+(1|team_name), data=phase1_exp_wbench)
Anova(model.phase1.randwalk.ratio, test.statistic = "F")
## Analysis of Deviance Table (Type II Wald F tests with Kenward-Roger df)
## 
## Response: sqrt(MASE_ratio2)
##             F Df Df.res                Pr(>F)    
## domain 14.759 11 314.07 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
phase1.randwalk.ratio.means<-as.data.frame(emmeans(model.phase1.randwalk.ratio,~domain, type="response")%>% rbind(adjust="mvt"))
phase1.randwalk.ratio.means$Estimate<-"Random Walk"
phase1.randwalk.ratio.means$Date<-"Tournament 1 (May 2020)"
phase1.randwalk.ratio.means
##  domain     response        SE     df  lower.CL upper.CL Estimate   
##  ideolrep  1.1115154 0.1168952 341.55 0.8009296 1.472878 Random Walk
##  ideoldem  1.0037219 0.1110824 341.55 0.7098496 1.348368 Random Walk
##  polar     1.7099008 0.1491641 343.56 1.3080761 2.165465 Random Walk
##  lifesat   1.8081101 0.1157576 329.28 1.4906471 2.156196 Random Walk
##  negaffect 0.9931435 0.1213288 345.71 0.6750677 1.372428 Random Walk
##  posaffect 1.8522021 0.1656922 345.71 1.4066302 2.358983 Random Walk
##  iafric    0.9849081 0.1276531 346.60 0.6522153 1.385923 Random Walk
##  iasian    1.7274176 0.1547831 344.56 1.3112177 2.200894 Random Walk
##  igend     2.7060627 0.2249634 346.99 2.0981832 3.391167 Random Walk
##  eafric    0.9343895 0.1321611 346.94 0.5931838 1.352784 Random Walk
##  easian    1.8144686 0.1698508 346.64 1.3591747 2.335419 Random Walk
##  egend     2.4534150 0.2189391 346.76 1.8645707 3.122937 Random Walk
##  Date                   
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Conf-level adjustment: mvt method for 12 estimates 
## Intervals are back-transformed from the sqrt scale
#here are the results of the random walk tests for Tournament 1

plot.t1.randwalk<-plot(emmeans(model.phase1.randwalk.ratio,~domain, type="response")%>% rbind(adjust="mvt"),comparisons=F, color="black")+scale_y_discrete(labels=labeling, name="Random Walk")+geom_vline(xintercept =1, linetype='dashed', color='black',14)+theme_minimal()+  labs(x="",shape="",color="", title="")
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.
model.phase1.linreg.ratio<-  lmer(sqrt(MASE_ratio3)~domain+(1|team_name), data=phase1_exp_wbench)
Anova(model.phase1.linreg.ratio, test.statistic = "F")
## Analysis of Deviance Table (Type II Wald F tests with Kenward-Roger df)
## 
## Response: sqrt(MASE_ratio3)
##             F Df Df.res                Pr(>F)    
## domain 11.184 11 313.44 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
phase1.linreg.ratio.means<-as.data.frame(emmeans(model.phase1.linreg.ratio,~domain, type="response")%>% rbind(adjust="mvt"))
phase1.linreg.ratio.means$Estimate<-"Linear Regression"
phase1.linreg.ratio.means$Date<-"Tournament 1 (May 2020)"
phase1.linreg.ratio.means
##  domain     response         SE     df  lower.CL upper.CL Estimate         
##  ideolrep  0.9252696 0.09920025 341.05 0.6621374 1.232323 Linear Regression
##  ideoldem  1.0141383 0.10385496 341.05 0.7376328 1.334564 Linear Regression
##  polar     1.2500811 0.11861034 343.20 0.9324554 1.614178 Linear Regression
##  lifesat   1.7190114 0.10504230 327.60 1.4303092 2.034235 Linear Regression
##  negaffect 0.8688873 0.10551616 345.58 0.5921241 1.198558 Linear Regression
##  posaffect 0.8038251 0.10148878 345.58 0.5386330 1.121925 Linear Regression
##  iafric    0.9775742 0.11822787 346.55 0.6673525 1.346831 Linear Regression
##  iasian    1.3868593 0.12896647 344.32 1.0410058 1.782233 Linear Regression
##  igend     1.9595877 0.17792498 346.99 1.4816583 2.504216 Linear Regression
##  eafric    0.9252382 0.12222470 346.94 0.6073485 1.309789 Linear Regression
##  easian    1.8019736 0.15735218 346.59 1.3781662 2.282513 Linear Regression
##  egend     1.5970939 0.16415893 346.76 1.1602001 2.103654 Linear Regression
##  Date                   
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
##  Tournament 1 (May 2020)
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Conf-level adjustment: mvt method for 12 estimates 
## Intervals are back-transformed from the sqrt scale
#here are the results of the linear regression tests for Tournament 1

plot.t1.linreg<-plot(emmeans(model.phase1.linreg.ratio,~domain, type="response")%>% rbind(adjust="mvt"),comparisons=F, color="black")+scale_y_discrete(labels=labeling, name="Linear Regression")+geom_vline(xintercept =1, linetype='dashed', color='black',14)+theme_minimal()+  labs(x="",shape="",color="", title="")
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.
##examine if better than all 3 benchmarks

phase1_exp_wbench$phase1_exp_wbench_topBenchmark <- pmin(phase1_exp_wbench$'Benchmark 1',phase1_exp_wbench$'Benchmark 2',phase1_exp_wbench$'Benchmark 3')
phase1_exp_wbench$MASE_ratio1_topBenchmark<- phase1_exp_wbench$phase1_exp_wbench_topBenchmark/phase1_exp_wbench$MASE1_w1

#skewness test suggests that sqrt is the most reasonable transformation  hence we will use it.
model.phase1.topBenchmark.ratio<-  lmer(sqrt(MASE_ratio1_topBenchmark)~domain+(1|team_name), data=phase1_exp_wbench)
Anova(model.phase1.topBenchmark.ratio, test.statistic = "F")
## Analysis of Deviance Table (Type II Wald F tests with Kenward-Roger df)
## 
## Response: sqrt(MASE_ratio1_topBenchmark)
##             F Df Df.res        Pr(>F)    
## domain 5.7183 11 317.85 0.00000002007 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(model.phase1.topBenchmark.ratio,~domain, type="response")%>% rbind(adjust="mvt")
##  domain    response     SE  df lower.CL upper.CL
##  ideolrep     0.934 0.0784 344    0.722     1.17
##  ideoldem     1.013 0.0816 344    0.791     1.26
##  polar        1.252 0.0935 345    0.997     1.53
##  lifesat      0.801 0.0562 338    0.647     0.97
##  negaffect    0.883 0.0839 346    0.658     1.14
##  posaffect    0.817 0.0807 346    0.602     1.07
##  iafric       0.977 0.0933 347    0.727     1.26
##  iasian       0.769 0.0757 346    0.567     1.00
##  igend        1.137 0.1070 347    0.850     1.47
##  eafric       0.958 0.0983 347    0.696     1.26
##  easian       0.782 0.0818 347    0.564     1.03
##  egend        1.521 0.1270 347    1.179     1.91
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Conf-level adjustment: mvt method for 12 estimates 
## Intervals are back-transformed from the sqrt scale
plot.t1.topBenchmark<-plot(emmeans(model.phase1.topBenchmark.ratio,~domain, type="response")%>% rbind(adjust="mvt"),comparisons=F, color="black")+scale_y_discrete(labels=labeling, name="Top Naive Benchmark")+geom_vline(xintercept =1, linetype='dashed', color='black',14)+theme_minimal()+  labs(x="",shape="",color="", title="Tournament 1 (May 2020)")
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.
#Tournament 2

phase2_exp_wbench<-dat_phase2 %>% left_join(pivot_wider(sim.w2 %>% dplyr::select(domain,response, source),
                                            names_from="source",values_from="response"))
## Joining with `by = join_by(domain)`
phase2_exp_wbench$domain <- factor(phase2_exp_wbench$domain,      # Reordering group factor levels
                         levels = c("ideolrep","ideoldem","polar",
                                    "lifesat","negaffect","posaffect",
                                    "iafric","iasian","igend",
                                    "eafric","easian","egend" ))

phase2_exp_wbench$MASE_ratio1<- phase2_exp_wbench$'Benchmark 1'/phase2_exp_wbench$MASE1_w2
phase2_exp_wbench$MASE_ratio2<- phase2_exp_wbench$'Benchmark 2'/phase2_exp_wbench$MASE1_w2
phase2_exp_wbench$MASE_ratio3<- phase2_exp_wbench$'Benchmark 3'/phase2_exp_wbench$MASE1_w2

###Test performance against all three benchmarks in a linear mixed model - ratio scores nested in participants

phase2_exp_wbench_long<-phase2_exp_wbench %>% pivot_longer(MASE_ratio1:MASE_ratio3,names_to="Benchmark",values_to="scores")
#use logs due to proportion scores
model.phase2.all.ratio<-  lmer(log(scores)~domain*Benchmark+(1|team_name), data=phase2_exp_wbench_long)

Anova(model.phase2.all.ratio, type=3)
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: log(scores)
##                     Chisq Df            Pr(>Chisq)    
## (Intercept)       11.4820  1             0.0007027 ***
## domain           126.3706 11 < 0.00000000000000022 ***
## Benchmark          6.8664  2             0.0322833 *  
## domain:Benchmark 286.1221 22 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(model.phase2.all.ratio,~domain, type="response")%>% rbind(adjust="mvt")
## NOTE: Results may be misleading due to involvement in interactions
##  domain    response     SE  df lower.CL upper.CL
##  ideolrep     1.216 0.0955 286    0.981     1.51
##  ideoldem     0.946 0.0743 286    0.763     1.17
##  polar        0.718 0.0564 286    0.579     0.89
##  lifesat      1.507 0.1050 189    1.243     1.83
##  negaffect    1.011 0.0777 268    0.819     1.25
##  posaffect    1.324 0.1020 268    1.072     1.64
##  iafric       1.812 0.1500 346    1.445     2.27
##  iasian       1.857 0.1490 312    1.489     2.31
##  igend        2.168 0.1790 344    1.730     2.72
##  eafric       2.211 0.1890 388    1.751     2.79
##  easian       2.221 0.1830 344    1.772     2.78
##  egend        1.211 0.1010 352    0.964     1.52
## 
## Results are averaged over some or all of the levels of: Benchmark 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Conf-level adjustment: mvt method for 12 estimates 
## Intervals are back-transformed from the log scale
#three out of 12 domains show no significant difference from zero (includes 1)

emmeans(model.phase2.all.ratio,~1, type="response")
##  1       response     SE  df lower.CL upper.CL
##  overall     1.43 0.0897 123     1.26     1.62
## 
## Results are averaged over the levels of: domain, Benchmark 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Intervals are back-transformed from the log scale
plot.t2.avBenchmark<-plot(emmeans(model.phase2.all.ratio,~domain, type="response")%>% rbind(adjust="mvt"),comparisons=F, color="black")+scale_y_discrete(labels=labeling, name="Domain")+geom_vline(xintercept =1, linetype='dashed', color='black',14)+theme_minimal()+  labs(x="",shape="",color="", title="Tournament 2 (Nov 2020)")
## NOTE: Results may be misleading due to involvement in interactions
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.
###Test individual markers


#here we use logs, because skewness suggests that sqrt is not enough and logs do a good job across all three markers

model.phase2.hist.mean.ratio<-  lmer(log(MASE_ratio1)~domain+(1|team_name), data=phase2_exp_wbench)
Anova(model.phase2.hist.mean.ratio, test.statistic = "F")
## Analysis of Deviance Table (Type II Wald F tests with Kenward-Roger df)
## 
## Response: log(MASE_ratio1)
##            F Df Df.res                Pr(>F)    
## domain 9.798 11 469.49 0.0000000000000004571 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#here are the results of the historical mean tests for Tournament 2
phase2.hist.mean.ratio.means<-as.data.frame(emmeans(model.phase2.hist.mean.ratio,~domain, type="response")%>% rbind(adjust="mvt"))
phase2.hist.mean.ratio.means$Estimate<-"Historical Mean"
phase2.hist.mean.ratio.means$Date<-"Tournament 2 (Nov 2020)"
phase2.hist.mean.ratio.means
##  domain     response        SE     df  lower.CL upper.CL Estimate       
##  ideolrep  1.5203753 0.1562034 506.28 1.1331109 2.039995 Historical Mean
##  ideoldem  1.0111460 0.1038852 506.28 0.7535864 1.356734 Historical Mean
##  polar     0.9386915 0.0963203 506.19 0.6998555 1.259034 Historical Mean
##  lifesat   0.9304218 0.0742710 431.58 0.7402942 1.169379 Historical Mean
##  negaffect 1.2365620 0.1223581 499.84 0.9316334 1.641295 Historical Mean
##  posaffect 1.4997699 0.1484025 499.84 1.1299267 1.990669 Historical Mean
##  iafric    1.8718833 0.2137105 526.65 1.3502817 2.594975 Historical Mean
##  iasian    1.3626424 0.1470760 517.50 1.0006057 1.855670 Historical Mean
##  igend     1.5932396 0.1817240 527.52 1.1496250 2.208035 Historical Mean
##  eafric    2.8630300 0.3482122 532.82 2.0216946 4.054490 Historical Mean
##  easian    1.3934965 0.1588979 527.38 1.0056036 1.931012 Historical Mean
##  egend     1.5564942 0.1796842 528.51 1.1186958 2.165624 Historical Mean
##  Date                   
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Conf-level adjustment: mvt method for 12 estimates 
## Intervals are back-transformed from the log scale
plot.t2.hist.mean<-plot(emmeans(model.phase2.hist.mean.ratio,~domain, type="response")%>% rbind(adjust="mvt"),comparisons=F, color="black")+scale_y_discrete(name="")+geom_vline(xintercept =1, linetype='dashed', color='black',14)+theme_minimal()+  labs(x="",shape="",color="", title="Tournament 2 (Nov 2020)")+theme(axis.text.y=element_blank())
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.
model.phase2.randwalk.ratio<-  lmer(log(MASE_ratio2)~domain+(1|team_name), data=phase2_exp_wbench)

Anova(model.phase2.randwalk.ratio, test.statistic = "F")
## Analysis of Deviance Table (Type II Wald F tests with Kenward-Roger df)
## 
## Response: log(MASE_ratio2)
##             F Df Df.res                Pr(>F)    
## domain 18.448 11 469.49 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
phase2.randwalk.ratio.means<-as.data.frame(emmeans(model.phase2.randwalk.ratio,~domain, type="response")%>% rbind(adjust="mvt"))
phase2.randwalk.ratio.means$Estimate<-"Random Walk"
phase2.randwalk.ratio.means$Date<-"Tournament 2 (Nov 2020)"
phase2.randwalk.ratio.means
##  domain    response        SE     df  lower.CL upper.CL Estimate   
##  ideolrep  1.302184 0.1337864 506.28 0.9704974 1.747231 Random Walk
##  ideoldem  1.020584 0.1048548 506.28 0.7606366 1.369367 Random Walk
##  polar     0.943981 0.0968631 506.19 0.7037862 1.266152 Random Walk
##  lifesat   1.879067 0.1499966 431.58 1.4950987 2.361645 Random Walk
##  negaffect 1.150176 0.1138101 499.84 0.8665542 1.526625 Random Walk
##  posaffect 2.256800 0.2233109 499.84 1.7002967 2.995447 Random Walk
##  iafric    2.003021 0.2286824 526.65 1.4448545 2.776815 Random Walk
##  iasian    2.725710 0.2941979 517.50 2.0015213 3.711924 Random Walk
##  igend     3.221499 0.3674424 527.52 2.3245642 4.464517 Random Walk
##  eafric    2.230675 0.2713028 532.82 1.5751517 3.159004 Random Walk
##  easian    3.175373 0.3620820 527.38 2.2914552 4.400258 Random Walk
##  egend     1.548851 0.1788018 528.51 1.1131852 2.155022 Random Walk
##  Date                   
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Conf-level adjustment: mvt method for 12 estimates 
## Intervals are back-transformed from the log scale
#here are the results of the random walk tests for Tournament 2

plot.t2.randwalk<-plot(emmeans(model.phase2.randwalk.ratio,~domain, type="response")%>% rbind(adjust="mvt"),comparisons=F, color="black")+scale_y_discrete(name="")+geom_vline(xintercept =1, linetype='dashed', color='black',14)+theme_minimal()+  labs(x="",shape="",color="", title="")+theme(axis.text.y=element_blank())
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.
model.phase2.linreg.ratio<-  lmer(log(MASE_ratio3)~domain+(1|team_name), data=phase2_exp_wbench)
Anova(model.phase2.linreg.ratio, test.statistic = "F")
## Analysis of Deviance Table (Type II Wald F tests with Kenward-Roger df)
## 
## Response: log(MASE_ratio3)
##             F Df Df.res                Pr(>F)    
## domain 31.062 11 469.49 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
phase2.linreg.ratio.means<-as.data.frame(emmeans(model.phase2.linreg.ratio,~domain, type="response")%>% rbind(adjust="mvt"))
phase2.linreg.ratio.means$Estimate<-"Linear Regression"
phase2.linreg.ratio.means$Date<-"Tournament 2 (Nov 2020)"
phase2.linreg.ratio.means
##  domain     response        SE     df  lower.CL upper.CL Estimate         
##  ideolrep  1.1301798 0.1161147 506.28 0.8423098 1.516433 Linear Regression
##  ideoldem  1.0210289 0.1049005 506.28 0.7609646 1.369972 Linear Regression
##  polar     0.5081725 0.0521442 506.19 0.3788738 0.681597 Linear Regression
##  lifesat   1.7844886 0.1424469 431.58 1.4198496 2.242772 Linear Regression
##  negaffect 0.9556993 0.0945666 499.84 0.7200156 1.268530 Linear Regression
##  posaffect 0.9030385 0.0893559 499.84 0.6803533 1.198611 Linear Regression
##  iafric    1.9800980 0.2260653 526.65 1.4283037 2.745066 Linear Regression
##  iasian    2.3485365 0.2534879 517.50 1.7245570 3.198284 Linear Regression
##  igend     2.4688132 0.2815915 527.52 1.7814549 3.421382 Linear Regression
##  eafric    2.2036377 0.2680145 532.82 1.5560777 3.120679 Linear Regression
##  easian    3.1427059 0.3583570 527.38 2.2678944 4.354965 Linear Regression
##  egend     0.8116076 0.0936933 528.51 0.5833251 1.129228 Linear Regression
##  Date                   
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
##  Tournament 2 (Nov 2020)
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Conf-level adjustment: mvt method for 12 estimates 
## Intervals are back-transformed from the log scale
#here are the results of the linear regression tests for Tournament 2


plot.t2.linreg<-plot(emmeans(model.phase2.linreg.ratio,~domain, type="response")%>% rbind(adjust="mvt"),comparisons=F, color="black")+scale_y_discrete(name="")+geom_vline(xintercept =1, linetype='dashed', color='black',14)+theme_minimal()+  labs(x="",shape="",color="", title="")+theme(axis.text.y=element_blank())
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.
##examine if better than all 3 benchmarks

phase2_exp_wbench$phase2_exp_wbench_topBenchmark <- pmin(phase2_exp_wbench$'Benchmark 1',phase2_exp_wbench$'Benchmark 2',phase2_exp_wbench$'Benchmark 3')
phase2_exp_wbench$MASE_ratio2_topBenchmark<- phase2_exp_wbench$phase2_exp_wbench_topBenchmark/phase2_exp_wbench$MASE1_w2

#skewness test suggests that sqrt is the most reasonable transformation  hence we will use it.
model.phase2.topBenchmark.ratio<-  lmer(sqrt(MASE_ratio2_topBenchmark)~domain+(1|team_name), data=phase2_exp_wbench)
Anova(model.phase2.topBenchmark.ratio, test.statistic = "F")
## Analysis of Deviance Table (Type II Wald F tests with Kenward-Roger df)
## 
## Response: sqrt(MASE_ratio2_topBenchmark)
##             F Df Df.res                Pr(>F)    
## domain 14.994 11 480.95 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model.phase2.topBenchmark.ratio.means<-emmeans(model.phase2.topBenchmark.ratio,~domain, type="response")%>% rbind(adjust="mvt")
model.phase2.topBenchmark.ratio.means
##  domain    response     SE  df lower.CL upper.CL
##  ideolrep     1.216 0.1410 523    0.845    1.654
##  ideoldem     1.162 0.1380 523    0.801    1.592
##  polar        0.541 0.0941 524    0.305    0.845
##  lifesat      1.215 0.1080 488    0.925    1.544
##  negaffect    1.010 0.1240 520    0.687    1.396
##  posaffect    0.983 0.1220 520    0.665    1.364
##  iafric       2.383 0.2210 532    1.791    3.059
##  iasian       1.573 0.1690 528    1.125    2.095
##  igend        1.709 0.1870 532    1.215    2.288
##  eafric       2.843 0.2590 534    2.150    3.632
##  easian       1.675 0.1850 533    1.186    2.248
##  egend        0.901 0.1380 533    0.550    1.339
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Conf-level adjustment: mvt method for 12 estimates 
## Intervals are back-transformed from the sqrt scale
plot.t2.topBenchmark<-plot(model.phase2.topBenchmark.ratio.means,comparisons=F, color="black")+scale_y_discrete(labels=labeling, name="Top Naive Benchmark")+geom_vline(xintercept =1, linetype='dashed', color='black',14)+theme_minimal()+  labs(x="",shape="",color="", title="Tournament 2 (Nov 2020)")
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.
#combine all graphs
figs2<-ggarrange(plot.t1.hist.mean,plot.t2.hist.mean,
                     plot.t1.randwalk,plot.t2.randwalk, 
                     plot.t1.linreg, plot.t2.linreg,  ncol=2, nrow=3,widths=c(1.3,1))

figs2

####REDO figure in ggplot, sorting the facets differently
# Combine the datasets of means
mean.scores.w.benchmarks <- rbind(phase1.hist.mean.ratio.means,phase1.randwalk.ratio.means,phase1.linreg.ratio.means,
                     phase2.hist.mean.ratio.means,phase2.randwalk.ratio.means,phase2.linreg.ratio.means) 
mean.scores.w.benchmarks$domain<-factor(mean.scores.w.benchmarks$domain,levels=c("iafric","ideolrep","eafric",
  "negaffect", "lifesat","easian","ideoldem","iasian", "polar", "igend","posaffect","egend"))
#### Figure 2 in the manuscript
mean.scores.w.benchmarks %>% ggplot(aes(x = response, y = domain, colour = Estimate, fill=Estimate))+
 geom_pointrange(aes(xmin=lower.CL, xmax=upper.CL), position=pd)+  theme_minimal(base_size = 14) +geom_vline(xintercept =1, linetype='dashed', color='red', 14)+facet_grid(~Date, scales = "free_y")+
theme(legend.position="bottom") +scale_color_aaas()+scale_fill_aaas()+  scale_y_discrete(labels=labeling, name="")+
  labs(colour = "",fill="", y="",x="Naïve Benchmark / Scientific Forecast Error Ratio (M +/- 95%CI)") 
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.

compare scores from tournament 1 and tournament 2

test complexity associations

#count how many domains per person
phase1_exp<-phase1_exp %>%group_by(team_name) %>%  mutate(n_domains = n())
phase1_exp$Domain_Publications<-ifelse(phase1_exp$pub==1,1,ifelse(phase1_exp$pub==2,0,NA))


#count how many domains per person
dat_phase2<-dat_phase2 %>%group_by(team_name) %>%  mutate(n_domains = n())
dat_phase2$Domain_Publications<-ifelse(dat_phase2$pub==1,1,ifelse(dat_phase2$pub==2,0,NA))

#####################
#create subsets for tournament 1 and for tournament 2 to use in analyses here and later for covariate analyses below
#####################
subset1<- phase1_exp %>% ungroup() %>% dplyr::select(MASE1_w1,domain,Method.code,ResponseId,team_name,covidcondyn,CounterFactual_Presence_Final,Method.complex,parameters_coded,n_domains,multi_dis.factor,team_discipline.coded,non_US,team_size.coded,team_gender,team_education,confidence,subexpert,Domain_Publications,previous_tournament.coded,TournamentStart) %>% mutate(inaccuracy = MASE1_w1,phase = "first")

subset2<- dat_phase2 %>% ungroup() %>% dplyr::select(MASE1_w2,domain,Method.code,ResponseId,team_name,covidcondyn,CounterFactual_Presence_Final,Method.complex,parameters_coded,n_domains,multi_dis.factor,team_discipline.coded,non_US,team_size.coded,team_gender,team_education,confidence,subexpert,Domain_Publications,previous_tournament.coded,TournamentStart) %>% mutate(inaccuracy = MASE1_w2,phase = "second")

##compare effects by domain for each tournament
##REPORTED IN MAIN TEXT####
subset1.model<-  lmer(log(inaccuracy)~domain+(1|team_name), data=subset1)
car::Anova(subset1.model,type="III", test.statistic="F") #sig effect
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: log(inaccuracy)
##                   F Df Df.res                Pr(>F)    
## (Intercept) 203.252  1 345.54 < 0.00000000000000022 ***
## domain       41.882 11 295.69 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(subset1.model,~domain, type="response")
##  domain    response    SE  df lower.CL upper.CL
##  eafric        4.89 0.544 346    3.928     6.09
##  easian        3.62 0.376 339    2.954     4.44
##  egend         1.10 0.124 346    0.876     1.37
##  iafric        6.30 0.667 340    5.114     7.76
##  iasian        2.27 0.223 328    1.872     2.75
##  ideoldem      3.13 0.292 315    2.608     3.76
##  ideolrep      5.46 0.509 315    4.541     6.56
##  igend         1.44 0.160 345    1.155     1.79
##  lifesat       4.05 0.302 257    3.494     4.69
##  negaffect     4.02 0.405 335    3.297     4.90
##  polar         1.94 0.186 321    1.610     2.34
##  posaffect     1.31 0.132 335    1.077     1.60
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Intervals are back-transformed from the log scale
partR2(subset1.model)
## 
## 
## R2 (marginal) and 95% CI for the full model: 
##  R2     CI_lower CI_upper nboot ndf
##  0.4498 NA       NA       1     12 
## 
## ----------
## 
## Part (semi-partial) R2:
## [1] "No partitions selected."
#rsq = 0.4498

subset2.model<-  lmer(log(inaccuracy)~domain+(1|team_name), data=subset2)
car::Anova(subset2.model,type="III", test.statistic="F") #sig effect
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: log(inaccuracy)
##                  F Df Df.res                Pr(>F)    
## (Intercept) 48.476  1 526.65     0.000000000009997 ***
## domain      26.872 11 469.49 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(subset2.model,~domain, type="response")
##  domain    response     SE  df lower.CL upper.CL
##  iafric       2.214 0.2530 527    1.769    2.771
##  ideolrep     4.048 0.4160 506    3.308    4.954
##  eafric       1.688 0.2050 533    1.329    2.144
##  negaffect    2.614 0.2590 500    2.152    3.175
##  lifesat      2.210 0.1760 432    1.889    2.586
##  easian       1.514 0.1730 527    1.210    1.895
##  ideoldem     2.516 0.2580 506    2.056    3.079
##  iasian       1.023 0.1100 518    0.827    1.264
##  polar        2.258 0.2320 506    1.846    2.762
##  igend        0.913 0.1040 528    0.730    1.143
##  posaffect    0.794 0.0786 500    0.654    0.965
##  egend        1.197 0.1380 529    0.954    1.502
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Intervals are back-transformed from the log scale
partR2(subset2.model)
## 
## 
## R2 (marginal) and 95% CI for the full model: 
##  R2     CI_lower CI_upper nboot ndf
##  0.2909 NA       NA       1     12 
## 
## ----------
## 
## Part (semi-partial) R2:
## [1] "No partitions selected."
#rsq = 0.2909

###########################

##############################
#combine tournament 1 and tournament 2 subsets for later analyses with covariates
#BEGINNING
##############################
both.sets<-bind_rows(subset1,subset2)

both.sets$covidconditional<-ifelse(both.sets$covidcondyn==1,1,0)
both.sets$covidconditional[is.na(both.sets$covidconditional)]<-0
both.sets$Method.complex[is.na(both.sets$Method.complex)]<-1 #simple when no extra info is provided, because the rest {number of parameters et.) suggests no extra factors considered}
both.sets$multi_dis.factor[is.na(both.sets$multi_dis.factor)]<-"Single domain expertise" #(setting is NA to non multidisciplinary)
both.sets$team_discipline.coded[is.na(both.sets$team_discipline.coded)]<-5 #(setting is NA to other)

both.sets$team_discipline.datasci<-ifelse(both.sets$team_discipline.coded==3,1,0)
both.sets$team_discipline.SBsci<-ifelse(both.sets$team_discipline.coded==1,1,ifelse(both.sets$team_discipline.coded==2,1,0))

#add complexity

both.sets<-both.sets %>% ungroup() %>% left_join(complexity)
## Joining with `by = join_by(domain)`
both.sets$sd_hist<-ifelse(both.sets$phase=="first",both.sets$sd_hist_w1,both.sets$sd_hist_w2)
both.sets$mad_hist<-ifelse(both.sets$phase=="first",both.sets$mad_hist_w1,both.sets$mad_hist_w2)
both.sets$perp_entropy_hist<-ifelse(both.sets$phase=="first",both.sets$perp_entropy_hist_w1,both.sets$perp_entropy_hist_w2)

both.sets$sd<-ifelse(both.sets$phase=="first",both.sets$sd_w1,both.sets$sd_w2)
both.sets$mad<-ifelse(both.sets$phase=="first",both.sets$mad_w1,both.sets$mad_w2)
both.sets$perp_entropy<-ifelse(both.sets$phase=="first",both.sets$perp_entropy_w1,both.sets$perp_entropy_w2)

#add domain differences in complexity between waves (just supplementary interests)
both.sets$sd_hist_diff<-both.sets$sd_hist_w2-both.sets$sd_hist_w1
both.sets$mad_hist_diff<-both.sets$mad_hist_w2-both.sets$mad_hist_w1
both.sets$perp_entropy_hist_diff<-both.sets$perp_entropy_hist_w2-both.sets$perp_entropy_hist_w1

both.sets$sd_diff<-both.sets$sd_w2-both.sets$sd_w1
both.sets$mad_diff<-both.sets$mad_w2-both.sets$mad_w1
both.sets$perp_entropy_diff<-both.sets$perp_entropy_w2-both.sets$perp_entropy_w1
##############################
#combine tournament 1 and tournament 2 subsets for later analyses with covariates
#END
##############################

#############################
#analyze comparison of tournament 1 to tournament 2, REPORTED IN THE MAIN TEXT
#BEGINNING
#############################

both.sets.model<-  lmer(log(inaccuracy)~phase+(1|team_name), data=both.sets)
car::Anova(both.sets.model,type="III", test.statistic="F") #sig effect
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: log(inaccuracy)
##                  F Df Df.res                Pr(>F)    
## (Intercept) 339.93  1 225.83 < 0.00000000000000022 ***
## phase        64.59  1 889.48  0.000000000000002923 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(both.sets.model,~phase, type="response")
##  phase  response     SE  df lower.CL upper.CL
##  first      2.87 0.1640 226     2.56     3.21
##  second     1.84 0.0918 150     1.67     2.03
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Intervals are back-transformed from the log scale
partR2(both.sets.model)
## 
## 
## R2 (marginal) and 95% CI for the full model: 
##  R2     CI_lower CI_upper nboot ndf
##  0.0628 NA       NA       1     2  
## 
## ----------
## 
## Part (semi-partial) R2:
## [1] "No partitions selected."
#effect size part rsq  0.0628

#####################
#supplementary analyses - comparison of tournament 1 versus tournament 2 by domain
#####################
both.sets.model.by.domain<-  lmer(log(inaccuracy)~phase*domain+(1|team_name), data=both.sets)
car::Anova(both.sets.model.by.domain,type="III", test.statistic="F") #sig effect
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: log(inaccuracy)
##                     F Df Df.res                Pr(>F)    
## (Intercept)  138.8956  1 878.58 < 0.00000000000000022 ***
## phase         37.5136  1 776.55      0.00000000144130 ***
## domain        27.6114 11 798.31 < 0.00000000000000022 ***
## phase:domain   7.1193 11 775.40      0.00000000001335 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(both.sets.model.by.domain,~phase|domain, type="response")
## domain = eafric:
##  phase  response     SE  df lower.CL upper.CL
##  first     4.677 0.6120 879    3.618    6.048
##  second    1.787 0.1990 840    1.435    2.224
## 
## domain = easian:
##  phase  response     SE  df lower.CL upper.CL
##  first     3.415 0.4160 867    2.688    4.338
##  second    1.592 0.1670 810    1.295    1.956
## 
## domain = egend:
##  phase  response     SE  df lower.CL upper.CL
##  first     1.041 0.1390 880    0.800    1.353
##  second    1.231 0.1310 813    0.999    1.517
## 
## domain = iafric:
##  phase  response     SE  df lower.CL upper.CL
##  first     5.924 0.7360 870    4.642    7.560
##  second    2.338 0.2460 809    1.902    2.873
## 
## domain = iasian:
##  phase  response     SE  df lower.CL upper.CL
##  first     2.195 0.2520 847    1.751    2.750
##  second    1.070 0.1070 775    0.879    1.301
## 
## domain = ideoldem:
##  phase  response     SE  df lower.CL upper.CL
##  first     3.046 0.3330 823    2.458    3.775
##  second    2.607 0.2480 740    2.163    3.143
## 
## domain = ideolrep:
##  phase  response     SE  df lower.CL upper.CL
##  first     5.304 0.5790 823    4.281    6.572
##  second    4.195 0.3990 740    3.480    5.057
## 
## domain = igend:
##  phase  response     SE  df lower.CL upper.CL
##  first     1.373 0.1800 878    1.061    1.776
##  second    0.947 0.0995 810    0.771    1.164
## 
## domain = lifesat:
##  phase  response     SE  df lower.CL upper.CL
##  first     3.690 0.3240 666    3.105    4.386
##  second    2.197 0.1660 534    1.894    2.549
## 
## domain = negaffect:
##  phase  response     SE  df lower.CL upper.CL
##  first     3.747 0.4430 860    2.972    4.725
##  second    2.723 0.2500 715    2.273    3.262
## 
## domain = polar:
##  phase  response     SE  df lower.CL upper.CL
##  first     1.925 0.2160 831    1.544    2.399
##  second    2.309 0.2200 735    1.915    2.784
## 
## domain = posaffect:
##  phase  response     SE  df lower.CL upper.CL
##  first     1.225 0.1450 860    0.971    1.544
##  second    0.827 0.0761 715    0.691    0.991
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Intervals are back-transformed from the log scale
emmeans(both.sets.model.by.domain,pairwise~phase|domain, type="response")
## $emmeans
## domain = eafric:
##  phase  response     SE  df lower.CL upper.CL
##  first     4.677 0.6120 879    3.618    6.048
##  second    1.787 0.1990 840    1.435    2.224
## 
## domain = easian:
##  phase  response     SE  df lower.CL upper.CL
##  first     3.415 0.4160 867    2.688    4.338
##  second    1.592 0.1670 810    1.295    1.956
## 
## domain = egend:
##  phase  response     SE  df lower.CL upper.CL
##  first     1.041 0.1390 880    0.800    1.353
##  second    1.231 0.1310 813    0.999    1.517
## 
## domain = iafric:
##  phase  response     SE  df lower.CL upper.CL
##  first     5.924 0.7360 870    4.642    7.560
##  second    2.338 0.2460 809    1.902    2.873
## 
## domain = iasian:
##  phase  response     SE  df lower.CL upper.CL
##  first     2.195 0.2520 847    1.751    2.750
##  second    1.070 0.1070 775    0.879    1.301
## 
## domain = ideoldem:
##  phase  response     SE  df lower.CL upper.CL
##  first     3.046 0.3330 823    2.458    3.775
##  second    2.607 0.2480 740    2.163    3.143
## 
## domain = ideolrep:
##  phase  response     SE  df lower.CL upper.CL
##  first     5.304 0.5790 823    4.281    6.572
##  second    4.195 0.3990 740    3.480    5.057
## 
## domain = igend:
##  phase  response     SE  df lower.CL upper.CL
##  first     1.373 0.1800 878    1.061    1.776
##  second    0.947 0.0995 810    0.771    1.164
## 
## domain = lifesat:
##  phase  response     SE  df lower.CL upper.CL
##  first     3.690 0.3240 666    3.105    4.386
##  second    2.197 0.1660 534    1.894    2.549
## 
## domain = negaffect:
##  phase  response     SE  df lower.CL upper.CL
##  first     3.747 0.4430 860    2.972    4.725
##  second    2.723 0.2500 715    2.273    3.262
## 
## domain = polar:
##  phase  response     SE  df lower.CL upper.CL
##  first     1.925 0.2160 831    1.544    2.399
##  second    2.309 0.2200 735    1.915    2.784
## 
## domain = posaffect:
##  phase  response     SE  df lower.CL upper.CL
##  first     1.225 0.1450 860    0.971    1.544
##  second    0.827 0.0761 715    0.691    0.991
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Intervals are back-transformed from the log scale 
## 
## $contrasts
## domain = eafric:
##  contrast       ratio    SE  df null t.ratio p.value
##  first / second 2.618 0.411 777    1   6.125  <.0001
## 
## domain = easian:
##  contrast       ratio    SE  df null t.ratio p.value
##  first / second 2.145 0.311 776    1   5.260  <.0001
## 
## domain = egend:
##  contrast       ratio    SE  df null t.ratio p.value
##  first / second 0.845 0.132 783    1  -1.073  0.2834
## 
## domain = iafric:
##  contrast       ratio    SE  df null t.ratio p.value
##  first / second 2.534 0.372 776    1   6.337  <.0001
## 
## domain = iasian:
##  contrast       ratio    SE  df null t.ratio p.value
##  first / second 2.052 0.278 778    1   5.296  <.0001
## 
## domain = ideoldem:
##  contrast       ratio    SE  df null t.ratio p.value
##  first / second 1.168 0.149 776    1   1.219  0.2232
## 
## domain = ideolrep:
##  contrast       ratio    SE  df null t.ratio p.value
##  first / second 1.264 0.162 776    1   1.837  0.0666
## 
## domain = igend:
##  contrast       ratio    SE  df null t.ratio p.value
##  first / second 1.449 0.222 777    1   2.426  0.0155
## 
## domain = lifesat:
##  contrast       ratio    SE  df null t.ratio p.value
##  first / second 1.679 0.165 792    1   5.275  <.0001
## 
## domain = negaffect:
##  contrast       ratio    SE  df null t.ratio p.value
##  first / second 1.376 0.185 787    1   2.373  0.0179
## 
## domain = polar:
##  contrast       ratio    SE  df null t.ratio p.value
##  first / second 0.834 0.109 783    1  -1.395  0.1633
## 
## domain = posaffect:
##  contrast       ratio    SE  df null t.ratio p.value
##  first / second 1.480 0.199 787    1   2.914  0.0037
## 
## Degrees-of-freedom method: kenward-roger 
## Tests are performed on the log scale
t.comparison.effects<-as.data.frame(emmeans(both.sets.model.by.domain,pairwise~phase|domain, type="response")$emmeans)
t.comparison<-as.data.frame(emmeans(both.sets.model.by.domain,pairwise~phase|domain, type="response")$contrasts)
#####################

#####################
#analyses of tournament 1 versus tournament 2 with covariates
#####################
both.sets.model.cov<-  lmer(log(inaccuracy)~phase+domain+
                              n_domains+team_discipline.datasci+team_discipline.SBsci+multi_dis.factor+team_size.coded+team_gender+team_education+Domain_Publications+previous_tournament.coded+(1|team_name), data=both.sets)
car::Anova(both.sets.model.cov,type="III", test.statistic="F") #sig effect
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: log(inaccuracy)
##                                 F Df Df.res                Pr(>F)    
## (Intercept)               52.0045  1 151.28      0.00000000002473 ***
## phase                     90.4478  1 847.79 < 0.00000000000000022 ***
## domain                    47.7820 11 833.72 < 0.00000000000000022 ***
## n_domains                  4.9718  1 130.12              0.027479 *  
## team_discipline.datasci    1.3287  1 219.62              0.250292    
## team_discipline.SBsci      2.9431  1 230.07              0.087591 .  
## multi_dis.factor           2.3465  1 216.88              0.127024    
## team_size.coded            0.0079  1  87.98              0.929260    
## team_gender                2.8957  1 109.50              0.091655 .  
## team_education             0.7948  1 101.93              0.374764    
## Domain_Publications        6.7055  1 773.33              0.009792 ** 
## previous_tournament.coded  4.9745  1  96.96              0.028029 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(both.sets.model.cov,~phase, type="response")
##  phase  response    SE  df lower.CL upper.CL
##  first      1.55 0.465 211    0.857     2.80
##  second     1.02 0.307 209    0.568     1.85
## 
## Results are averaged over the levels of: domain, team_discipline.datasci, team_discipline.SBsci, multi_dis.factor, Domain_Publications, previous_tournament.coded 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Intervals are back-transformed from the log scale
partR2(both.sets.model.cov, partvars = 
         c("phase","domain"))
## 
## 
## R2 (marginal) and 95% CI for the full model: 
##  R2     CI_lower CI_upper nboot ndf
##  0.3848 NA       NA       1     22 
## 
## ----------
## 
## Part (semi-partial) R2:
##  Predictor(s) R2     CI_lower CI_upper nboot ndf
##  Model        0.3848 NA       NA       1     22 
##  phase        0.0617 NA       NA       1     21 
##  domain       0.2680 NA       NA       1     11 
##  phase+domain 0.3282 NA       NA       1     10
#ergo, part rsq for phase itself remains 0.0617

#graph change in ranking

##MAIN TEXT FIGURE SHOWING RANKING AND DOMAIN'S MASE, AS WELL AS SHOWING WHICH DIFFERENCES ARE SIG

#get ranking of scores among academics in May and November
median.MASE.t1$phase<-"first"
median.MASE.t2$phase<-"second"
median.MASE.t1$Wave<-"First Tournament\nMay 2020"
median.MASE.t2$Wave<-"Second Tournament\nNov 2020"
median.ranks<-bind_rows(median.MASE.t1,median.MASE.t2)
median.ranks$Domain[median.ranks$domain=="eafric"]<-"Exp. Afr.-Am. Bias"
## Warning: Unknown or uninitialised column: `Domain`.
median.ranks$Domain[median.ranks$domain=="easian"]<-"Exp. Asian-Am. Bias"
median.ranks$Domain[median.ranks$domain=="egend"]<-"Exp. Gender Bias"
median.ranks$Domain[median.ranks$domain=="iafric"]<-"Imp. Afr.-Am. Bias"
median.ranks$Domain[median.ranks$domain=="iasian"]<-"Imp. Asian-Am. Bias"
median.ranks$Domain[median.ranks$domain=="ideoldem"]<-"Democrat. Support"
median.ranks$Domain[median.ranks$domain=="ideolrep"]<-"Republic. Support"
median.ranks$Domain[median.ranks$domain=="igend"]<-"Imp. Gender Bias"
median.ranks$Domain[median.ranks$domain=="lifesat"]<-"Life Satisfaction"
median.ranks$Domain[median.ranks$domain=="polar"]<-"Polarization"
median.ranks$Domain[median.ranks$domain=="negaffect"]<-"Negative Affect"
median.ranks$Domain[median.ranks$domain=="posaffect"]<-"Positive Affect"

median.ranks<-median.ranks %>% left_join(t.comparison.effects) %>% left_join(t.comparison %>% dplyr::select(domain,ratio,t.ratio,p.value)) #add the sig testing from the tournament comparisons, incl ratio size and p-values
## Joining with `by = join_by(domain, phase)`
## Joining with `by = join_by(domain)`
#NOTE: here we have median ranks per domain, but also the estimates scores from multi-level models accounting for multiple predictions by different scientist groups. Due to this dependence in the data, we use the latter estimates.


### Figure 3 in the manuscript
median.ranks$sig<-ifelse(median.ranks$p.value<.05,"eff","noeff")
median.ranks$MASE<-round(median.ranks$response,2) #two decimals
median.ranks$ranksize<-round(median.ranks$ratio,2) #two decimals
figure_3<-newggslopegraph(dataframe = median.ranks,
                Times = Wave,
                Measurement = MASE,
                Grouping = Domain,LineThickness = 2,
                WiderLabels=T,
                Title = "Which domains are harder to predict?",TitleJustify = "center",
                SubTitle = NULL,
                Caption = "Ranking based on MASE scores per domain",
                ThemeChoice="ipsum")+scale_color_d3(palette = "category20")+geom_line(aes(linetype=sig, color="black",alpha=1))

figure_3
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## not found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

Consistency in forecasting

#to assess and protect against the possibility that forecasting models are accurate by chance (in the same way that some investing strategies can “get lucky” in a particular time point without actually being better than other strategies), we used subsets of the data (odd and event months) to determine whether model accuracy in one subset of predictions (ranking of model performance across odd months) correlates with model accuracy in the other subset (ranking of model performance across even months). 

dat_long_phase1_exp<-(subset(dat_long_phase1, isExpert.factor == 'Academic'))

dat_long_phase1_exp_wide_by_month<-dat_long_phase1_exp %>%dplyr::select(domain,team_name,Month,value.dif) %>%  pivot_wider(names_from=c(Month),values_from=c(value.dif))

dat_long_phase1_exp_wide_by_month$odd_month_inaccuracy=rowMeans(dat_long_phase1_exp_wide_by_month[c("1","3","5","7","9","11")],na.rm=T)

dat_long_phase1_exp_wide_by_month$even_month_inaccuracy=rowMeans(dat_long_phase1_exp_wide_by_month[c("2","4","6","8","10","12")],na.rm=T)

#correlations by domain

dat_long_phase1_exp_wide_by_month %>%dplyr::select(domain,odd_month_inaccuracy,even_month_inaccuracy) %>% 
    group_by(domain) %>%
    correlation::correlation(ranktransform =T)    
## # Correlation Matrix (pearson-method)
## 
## Group     |           Parameter1 |            Parameter2 |    r |       95% CI
## ------------------------------------------------------------------------------
## egend     | odd_month_inaccuracy | even_month_inaccuracy | 0.62 | [0.26, 0.83]
## easian    | odd_month_inaccuracy | even_month_inaccuracy | 0.97 | [0.94, 0.99]
## eafric    | odd_month_inaccuracy | even_month_inaccuracy | 0.99 | [0.97, 0.99]
## igend     | odd_month_inaccuracy | even_month_inaccuracy | 0.73 | [0.45, 0.88]
## iasian    | odd_month_inaccuracy | even_month_inaccuracy | 0.86 | [0.72, 0.93]
## iafric    | odd_month_inaccuracy | even_month_inaccuracy | 0.55 | [0.20, 0.78]
## posaffect | odd_month_inaccuracy | even_month_inaccuracy | 0.54 | [0.21, 0.76]
## negaffect | odd_month_inaccuracy | even_month_inaccuracy | 0.89 | [0.77, 0.95]
## lifesat   | odd_month_inaccuracy | even_month_inaccuracy | 0.94 | [0.91, 0.97]
## polar     | odd_month_inaccuracy | even_month_inaccuracy | 0.73 | [0.51, 0.86]
## ideoldem  | odd_month_inaccuracy | even_month_inaccuracy | 0.93 | [0.86, 0.97]
## ideolrep  | odd_month_inaccuracy | even_month_inaccuracy | 0.96 | [0.92, 0.98]
## 
## Group     |     t | df |         p
## ----------------------------------
## egend     |  3.45 | 19 | 0.003**  
## easian    | 21.07 | 24 | < .001***
## eafric    | 28.11 | 20 | < .001***
## igend     |  4.79 | 20 | < .001***
## iasian    |  8.89 | 28 | < .001***
## iafric    |  3.16 | 23 | 0.004**  
## posaffect |  3.27 | 26 | 0.003**  
## negaffect |  9.72 | 26 | < .001***
## lifesat   | 21.30 | 55 | < .001***
## polar     |  5.83 | 30 | < .001***
## ideoldem  | 14.39 | 32 | < .001***
## ideolrep  | 19.10 | 32 | < .001***
## 
## p-value adjustment method: Holm (1979)
## Observations: 21-57
#multilevel
dat_long_phase1_exp_wide_by_month %>%dplyr::select(domain,odd_month_inaccuracy,even_month_inaccuracy) %>% 
    correlation::correlation(multilevel=T,ranktransform=T)  
## Parameter1           |            Parameter2 |    r |           CI | t(357) |         p
## ---------------------------------------------------------------------------------------
## odd_month_inaccuracy | even_month_inaccuracy | 0.88 | [0.85, 0.90] |  34.80 | < .001***
## 
## Observations: 359
###PHASE 2

dat_long$Month7<-dat_long$Month-7
dat_long_phase2<-dat_long %>%filter(!(phase == 1 & revised == 1)& Method.code!="Ground Truth"& Method.code!="Naive-linear"&Method.code!="Naive-rfw" & Month %in% c(7,8,9,10,11,12))



dat_long_phase2_exp_wide_by_month<-dat_long_phase2 %>%dplyr::select(domain,team_name,Month7,value.dif) %>%  pivot_wider(names_from=c(Month7),values_from=c(value.dif))

dat_long_phase2_exp_wide_by_month$even_month_inaccuracy=rowMeans(dat_long_phase2_exp_wide_by_month[c("1","3","5")],na.rm=T) # Evie: I think these are even months since previously the month was subtracted by 7

dat_long_phase2_exp_wide_by_month$odd_month_inaccuracy=rowMeans(dat_long_phase2_exp_wide_by_month[c("2","4","0")],na.rm=T) # Evie: I think these are odd months since previously the month was subtracted by 7

#correlations by domain

dat_long_phase2_exp_wide_by_month %>%dplyr::select(domain,odd_month_inaccuracy,even_month_inaccuracy) %>% 
    group_by(domain) %>%
    correlation::correlation(ranktransform =T)    
## # Correlation Matrix (pearson-method)
## 
## Group     |           Parameter1 |            Parameter2 |    r |        95% CI
## -------------------------------------------------------------------------------
## egend     | odd_month_inaccuracy | even_month_inaccuracy | 0.81 | [ 0.65, 0.90]
## easian    | odd_month_inaccuracy | even_month_inaccuracy | 0.92 | [ 0.85, 0.96]
## eafric    | odd_month_inaccuracy | even_month_inaccuracy | 0.96 | [ 0.93, 0.98]
## igend     | odd_month_inaccuracy | even_month_inaccuracy | 0.59 | [ 0.33, 0.77]
## iasian    | odd_month_inaccuracy | even_month_inaccuracy | 0.85 | [ 0.74, 0.92]
## iafric    | odd_month_inaccuracy | even_month_inaccuracy | 0.91 | [ 0.83, 0.95]
## posaffect | odd_month_inaccuracy | even_month_inaccuracy | 0.63 | [ 0.43, 0.77]
## negaffect | odd_month_inaccuracy | even_month_inaccuracy | 0.24 | [-0.04, 0.49]
## lifesat   | odd_month_inaccuracy | even_month_inaccuracy | 0.96 | [ 0.93, 0.97]
## polar     | odd_month_inaccuracy | even_month_inaccuracy | 0.71 | [ 0.53, 0.83]
## ideoldem  | odd_month_inaccuracy | even_month_inaccuracy | 0.77 | [ 0.63, 0.87]
## ideolrep  | odd_month_inaccuracy | even_month_inaccuracy | 0.78 | [ 0.64, 0.87]
## 
## Group     |     t | df |         p
## ----------------------------------
## egend     |  7.92 | 34 | < .001***
## easian    | 14.14 | 35 | < .001***
## eafric    | 19.88 | 30 | < .001***
## igend     |  4.32 | 35 | < .001***
## iasian    | 10.41 | 40 | < .001***
## iafric    | 13.11 | 35 | < .001***
## posaffect |  5.66 | 49 | < .001***
## negaffect |  1.75 | 49 | 0.087    
## lifesat   | 29.13 | 80 | < .001***
## polar     |  6.72 | 45 | < .001***
## ideoldem  |  8.21 | 45 | < .001***
## ideolrep  |  8.44 | 45 | < .001***
## 
## p-value adjustment method: Holm (1979)
## Observations: 32-82
#multilevel
dat_long_phase2_exp_wide_by_month %>%dplyr::select(domain,odd_month_inaccuracy,even_month_inaccuracy) %>% 
    correlation::correlation(multilevel=T,ranktransform=T) 
## Parameter1           |            Parameter2 |    r |           CI | t(544) |         p
## ---------------------------------------------------------------------------------------
## odd_month_inaccuracy | even_month_inaccuracy | 0.72 | [0.67, 0.75] |  23.95 | < .001***
## 
## Observations: 546
### Compare performance from PHASE 1 and PHASE 2 to see if higher scores in Tournament 1 correspond to higher scores in Tournament 2

#obviously, we have to take people from Tournament 1 and examine their scores in Tournament 1 and Tournament 2

##correlation of errors between T1 and T2 (38 groups)
model.phase2.predicted.by.phase1.academ<-  lmer(scale(log(MASE1_w2))~scale(log(MASE1_w1))*domain+(1|ResponseId), data=dat_phase2)
summ(model.phase2.predicted.by.phase1.academ, digits=4) #icc is 866
Observations 179
Dependent variable scale(log(MASE1_w2))
Type Mixed effects linear regression
AIC 168.8406
BIC 251.7126
Pseudo-R² (fixed effects) 0.8969
Pseudo-R² (total) 0.9047
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 1.2016 0.4951 2.4269 150.3459 0.0164
scale(log(MASE1_w1)) -0.1759 0.5866 -0.2999 150.3934 0.7647
domainideolrep -0.4953 0.5047 -0.9813 150.5064 0.3280
domaineafric -0.7362 0.5100 -1.4436 140.1834 0.1511
domainnegaffect -0.6649 0.5187 -1.2819 149.4182 0.2019
domainlifesat -0.7051 0.4982 -1.4155 149.5140 0.1590
domaineasian -0.5588 0.5013 -1.1147 145.2991 0.2668
domainideoldem -1.2804 0.4992 -2.5649 149.2003 0.0113
domainiasian -0.5760 0.5385 -1.0695 148.4168 0.2866
domainpolar -0.6584 0.5077 -1.2969 152.3912 0.1966
domainigend -0.7920 0.6425 -1.2328 152.3259 0.2196
domainposaffect -0.5728 0.5911 -0.9690 144.4750 0.3342
domainegend -0.1411 0.5728 -0.2464 154.2718 0.8057
scale(log(MASE1_w1)):domainideolrep 1.0274 0.5980 1.7179 152.0060 0.0878
scale(log(MASE1_w1)):domaineafric 1.2651 0.6178 2.0477 140.4399 0.0425
scale(log(MASE1_w1)):domainnegaffect 0.7363 0.6780 1.0859 152.7377 0.2792
scale(log(MASE1_w1)):domainlifesat 1.0769 0.5880 1.8314 150.1703 0.0690
scale(log(MASE1_w1)):domaineasian 1.0839 0.6122 1.7705 149.1651 0.0787
scale(log(MASE1_w1)):domainideoldem 1.1404 0.6087 1.8736 151.5617 0.0629
scale(log(MASE1_w1)):domainiasian 1.0349 0.6406 1.6157 150.3217 0.1083
scale(log(MASE1_w1)):domainpolar 1.1254 0.6051 1.8598 143.2226 0.0650
scale(log(MASE1_w1)):domainigend 0.9746 0.6470 1.5063 140.6425 0.1342
scale(log(MASE1_w1)):domainposaffect 1.3058 0.6683 1.9538 154.6232 0.0525
scale(log(MASE1_w1)):domainegend 1.3209 0.6079 2.1730 145.0762 0.0314
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
ResponseId (Intercept) 0.0853
Residual 0.2982
Grouping Variables
Group # groups ICC
ResponseId 48 0.0756
emtrends(model.phase2.predicted.by.phase1.academ,"domain",var = "MASE1_w1"                        ) %>% rbind(adjust="mvt") #in nine domains, forecasting error scores from Tournament 1 predict forecasting error scores from Tournament 2
##  domain    MASE1_w1.trend     SE  df lower.CL upper.CL
##  iafric           -0.0502 0.1690 152  -0.5404    0.440
##  ideolrep          0.2429 0.0319 155   0.1505    0.335
##  eafric            0.3107 0.0611 149   0.1335    0.488
##  negaffect         0.1598 0.0968 154  -0.1208    0.440
##  lifesat           0.2570 0.0125 153   0.2206    0.293
##  easian            0.2590 0.0518 154   0.1090    0.409
##  ideoldem          0.2751 0.0462 155   0.1412    0.409
##  iasian            0.2450 0.0745 154   0.0291    0.461
##  polar             0.2708 0.0476 154   0.1329    0.409
##  igend             0.2278 0.0822 146  -0.0106    0.466
##  posaffect         0.3223 0.0895 155   0.0627    0.582
##  egend             0.3266 0.0497 153   0.1826    0.471
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Conf-level adjustment: mvt method for 12 estimates
model.phase2.predicted.by.phase1.academ.diff<-  lmer(scale((MASE1_w1-MASE1_w2))~scale(log(MASE1_w1))*domain+(1|ResponseId), data=dat_phase2) #effect of inaccuracy in the 12m tournament for changes in inaccuracy between 12 and 6 m tournaments
emtrends(model.phase2.predicted.by.phase1.academ.diff,"domain",var = "MASE1_w1"                        ) %>% rbind(adjust="mvt")
##  domain    MASE1_w1.trend     SE  df lower.CL upper.CL
##  iafric           0.79830 0.2550 109   0.0561    1.540
##  ideolrep        -0.02190 0.0517 112  -0.1725    0.129
##  eafric          -0.01154 0.0901 109  -0.2741    0.251
##  negaffect        0.10599 0.1510 111  -0.3333    0.545
##  lifesat          0.24945 0.0324 154   0.1555    0.343
##  easian           0.02999 0.0807 111  -0.2053    0.265
##  ideoldem        -0.07841 0.0745 112  -0.2953    0.139
##  iasian           0.03271 0.1170 111  -0.3083    0.374
##  polar            0.02521 0.0768 113  -0.1984    0.249
##  igend           -0.01277 0.1200 109  -0.3616    0.336
##  posaffect       -0.00707 0.1490 114  -0.4407    0.427
##  egend           -0.09577 0.0787 112  -0.3250    0.133
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Conf-level adjustment: mvt method for 12 estimates

visualize by method (phase 1 and 2)

#analyses of phase 1  - MASE overall
#For models evaluating overall accuracy of the forecasted model, we will use forecasting type (purely theoretical, purely data-driven and hybrid models), forecasting domain as predictors, with MASE scores nested within teams. 

######################################
#what is the percentage using different method?

#Tournament 1: 
prop.table(table(phase1_exp$Method.code))
## 
##      Data-Driven           Hybrid Intuition/Theory 
##        0.5097493        0.0724234        0.4178273
#Tournament 2: 
prop.table(table(dat_phase2$Method.code))
## 
##      Data-Driven           Hybrid Intuition/Theory 
##       0.53113553       0.08241758       0.38644689
table(phase2$Method.code)
## 
##      Data-Driven     Ground Truth           Hybrid Intuition/Theory 
##              182               12               32              153 
##     Naive-linear        Naive-rfw 
##               12               12
#SUPPLEMENTARY FIGURE showing differences in percentages of each category by domain
######################################
perc.by.domain.phase1<-phase1_exp %>%
  group_by(domain,Method.code) %>%
  summarise(n = n()) %>%
  mutate(perc = round(n / sum(n)*100),2) %>% 
  ggplot(aes(x = "", y = perc, fill = Method.code)) +
  geom_col(color = "black") +
  geom_label(aes(label = perc),
             color = "white",
             position = position_stack(vjust = 0.5),
             show.legend = FALSE) +scale_fill_jama()+labs(fill="")+
  coord_polar(theta = "y")+theme_void()+facet_wrap(~domain, nrow = 4, labeller=labeller(domain=labels))+theme(legend.position="bottom")
## `summarise()` has grouped output by 'domain'. You can override using the
## `.groups` argument.
perc.by.domain.phase1

#Tournament 2

perc.by.domain.phase2<-dat_phase2 %>%
  group_by(domain,Method.code) %>%
  summarise(n = n()) %>%
  mutate(perc = round(n / sum(n)*100),2) %>% 
  ggplot(aes(x = "", y = perc, fill = Method.code)) +
  geom_col(color = "black") +
  geom_label(aes(label = perc),
             color = "white",
             position = position_stack(vjust = 0.5),
             show.legend = FALSE) +scale_fill_jama()+labs(fill="")+
  coord_polar(theta = "y")+theme_void()+facet_wrap(~domain, nrow = 4, labeller=labeller(domain=labels))+theme(legend.position="bottom")
## `summarise()` has grouped output by 'domain'. You can override using the
## `.groups` argument.
#combine plots
#plot percentages of different forecasting method choices by domain for tournament 1 and tournament 2 (i.e., put them together)
### Fig. S3 in the manuscript

perc_per_domain_both<-cowplot::plot_grid(perc.by.domain.phase1,perc.by.domain.phase2,labels=c("1st T.","2nd T."), label_size = 10,
  align = "v")

perc_per_domain_both
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

####################################

####################################
#examine analyses of forecasting method choice on accuracy
####################################
#recorder levels of the domains (to use later)
dat_long$domain <- factor(dat_long$domain,      # Reordering group factor levels
                         levels = c("egend","easian","eafric",
                                    "igend","iasian","iafric",
                                    "posaffect","negaffect","lifesat",
                                    "polar","ideoldem","ideolrep"))
#Tournament 1: run models
model.phase1.across.domains<-  lmer(log(MASE1_w1)~Method.code+domain+(1|ResponseId), data=subset(phase1,isExpert.factor=="Academic"))
car::Anova(model.phase1.across.domains,type="III", test.statistic="F") 
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: log(MASE1_w1)
##                    F Df Df.res                Pr(>F)    
## (Intercept) 218.1304  1 296.68 < 0.00000000000000022 ***
## Method.code   5.4702  2 149.10              0.005099 ** 
## domain       40.5746 11 301.01 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
partR2(model.phase1.across.domains, partvars = 
         c("Method.code","domain"))
## 
## 
## R2 (marginal) and 95% CI for the full model: 
##  R2     CI_lower CI_upper nboot ndf
##  0.4949 NA       NA       1     14 
## 
## ----------
## 
## Part (semi-partial) R2:
##  Predictor(s)       R2     CI_lower CI_upper nboot ndf
##  Model              0.4949 NA       NA       1     14 
##  Method.code        0.0958 NA       NA       1     12 
##  domain             0.4735 NA       NA       1      4 
##  Method.code+domain 0.4949 NA       NA       1      1
data.phase1.MASE.total<-as.data.frame(emmeans(model.phase1.across.domains,pairwise ~Method.code, type = "response", adjust = "none")$emmeans)

#next run Tournament 2 models
data.phase2.model.across.domains<-  lmer(log(MASE1_w2)~Method.code * domain+(1|ResponseId), data=dat_phase2)
car::Anova(data.phase2.model.across.domains,type="III", test.statistic="F") #sig interaction!
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: log(MASE1_w2)
##                          F Df Df.res                Pr(>F)    
## (Intercept)        23.4104  1 485.14           0.000001761 ***
## Method.code         0.7041  2 502.06              0.495017    
## domain             15.4094 11 435.44 < 0.00000000000000022 ***
## Method.code:domain  2.1050 22 455.80              0.002607 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
data.phase2.MASE.total<-as.data.frame(emmeans(data.phase2.model.across.domains,pairwise ~Method.code, type = "response", adjust = "none")$emmeans)
## NOTE: Results may be misleading due to involvement in interactions
partR2(data.phase2.model.across.domains, partvars = 
         c("Method.code","domain"))
## fixed-effect model matrix is rank deficient so dropping 1 column / coefficient
## fixed-effect model matrix is rank deficient so dropping 1 column / coefficient
## 
## 
## R2 (marginal) and 95% CI for the full model: 
##  R2     CI_lower CI_upper nboot ndf
##  0.3534 NA       NA       1     36 
## 
## ----------
## 
## Part (semi-partial) R2:
##  Predictor(s)       R2     CI_lower CI_upper nboot ndf
##  Model              0.3534 NA       NA       1     36 
##  Method.code        0.0000 NA       NA       1     48 
##  domain             0.0000 NA       NA       1     48 
##  Method.code+domain 0.0000 NA       NA       1     48
## we test if forecasts that considered historical data as part of the forecast modelling were more accurate than models that did not - MAIN TEXT
#i.e., EXAMINE ONLY ACADEMICS, USING CONTRAST OF THEORY vs. DATA.HYBRID
### Tournament 1
phase1_exp$method.contrast<-ifelse(phase1_exp$Method.code=='Intuition/Theory',0,1)
model.phase1.contrast<-  lmer(log(MASE1_w1)~method.contrast+domain+(1|ResponseId), data=phase1_exp)
## boundary (singular) fit: see help('isSingular')
car::Anova(model.phase1.contrast,type="III", test.statistic="F") #sig domain effect,  and sig interaction
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: log(MASE1_w1)
##                       F Df Df.res                Pr(>F)    
## (Intercept)     197.300  1 340.80 < 0.00000000000000022 ***
## method.contrast  20.379  1  56.29            0.00003289 ***
## domain           34.739 11 326.38 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summ(model.phase1.contrast, digits=4) #get effect size for the overall model
Observations 359
Dependent variable log(MASE1_w1)
Type Mixed effects linear regression
AIC 596.3441
BIC 654.5939
Pseudo-R² (fixed effects) 0.5284
Pseudo-R² (total) 0.5284
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 1.6361 0.1160 14.1086 346.0000 0.0000
method.contrast -0.2547 0.0555 -4.5914 346.0000 0.0000
domaineasian -0.2985 0.1492 -2.0011 346.0000 0.0462
domainegend -1.4378 0.1569 -9.1640 346.0000 0.0000
domainiafric 0.2483 0.1503 1.6520 346.0000 0.0994
domainiasian -0.7913 0.1445 -5.4775 346.0000 0.0000
domainideoldem -0.3982 0.1407 -2.8296 346.0000 0.0049
domainideolrep 0.1563 0.1407 1.1110 346.0000 0.2673
domainigend -1.1906 0.1550 -7.6800 346.0000 0.0000
domainlifesat -0.0832 0.1293 -0.6438 346.0000 0.5201
domainnegaffect -0.1587 0.1466 -1.0822 346.0000 0.2799
domainpolar -0.8897 0.1430 -6.2200 346.0000 0.0000
domainposaffect -1.2771 0.1466 -8.7105 346.0000 0.0000
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
ResponseId (Intercept) 0.0000
Residual 0.5142
Grouping Variables
Group # groups ICC
ResponseId 86 0.0000
partR2(model.phase1.contrast, partvars = 
         c("method.contrast","domain"))
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## 
## 
## R2 (marginal) and 95% CI for the full model: 
##  R2     CI_lower CI_upper nboot ndf
##  0.5284 NA       NA       1     13 
## 
## ----------
## 
## Part (semi-partial) R2:
##  Predictor(s)           R2     CI_lower CI_upper nboot ndf
##  Model                  0.5284 NA       NA       1     13 
##  method.contrast        0.0961 NA       NA       1     12 
##  domain                 0.5045 NA       NA       1      2 
##  method.contrast+domain 0.5284 NA       NA       1      1
### Tournament 2
dat_phase2$method.contrast<-ifelse(dat_phase2$Method.code=='Intuition/Theory',0,1)
model.phase2.contrast<-  lmer(log(MASE1_w2)~method.contrast+domain+(1|ResponseId), data=dat_phase2)
car::Anova(model.phase2.contrast,type="III", test.statistic="F") #sig domain effect,  and sig interaction
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: log(MASE1_w2)
##                       F Df Df.res                Pr(>F)    
## (Intercept)     56.7385  1 492.87    0.0000000000002396 ***
## method.contrast  8.1158  1 159.11              0.004969 ** 
## domain          27.1342 11 470.96 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summ(model.phase2.contrast, digits=4) #get effect size for the overall model
Observations 546
Dependent variable log(MASE1_w2)
Type Mixed effects linear regression
AIC 1184.0444
BIC 1248.5837
Pseudo-R² (fixed effects) 0.3136
Pseudo-R² (total) 0.5062
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 0.9521 0.1262 7.5467 471.8974 0.0000
method.contrast -0.2607 0.0910 -2.8640 112.5214 0.0050
domainideolrep 0.5944 0.1408 4.2227 441.2213 0.0000
domaineafric -0.2781 0.1511 -1.8406 403.2493 0.0664
domainnegaffect 0.1613 0.1382 1.1675 441.3406 0.2436
domainlifesat -0.0042 0.1289 -0.0327 480.9780 0.9739
domaineasian -0.3948 0.1469 -2.6879 419.5075 0.0075
domainideoldem 0.1187 0.1408 0.8435 441.2213 0.3994
domainiasian -0.7867 0.1418 -5.5474 413.8289 0.0000
domainpolar -0.0056 0.1412 -0.0398 443.7400 0.9682
domainigend -0.8972 0.1460 -6.1453 409.6400 0.0000
domainposaffect -1.0298 0.1382 -7.4517 441.3406 0.0000
domainegend -0.6361 0.1485 -4.2846 424.6643 0.0000
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
ResponseId (Intercept) 0.3861
Residual 0.6184
Grouping Variables
Group # groups ICC
ResponseId 120 0.2805
partR2(model.phase2.contrast, partvars = 
         c("method.contrast","domain"))
## 
## 
## R2 (marginal) and 95% CI for the full model: 
##  R2     CI_lower CI_upper nboot ndf
##  0.3136 NA       NA       1     13 
## 
## ----------
## 
## Part (semi-partial) R2:
##  Predictor(s)           R2     CI_lower CI_upper nboot ndf
##  Model                  0.3136 NA       NA       1     13 
##  method.contrast        0.0838 NA       NA       1     12 
##  domain                 0.2958 NA       NA       1      2 
##  method.contrast+domain 0.3136 NA       NA       1      1
## Test if model comparison effects were qualified by significant model type X domain interaction
### Tournament 1
model.phase1.contrast.by.domain<-  lmer(log(MASE1_w1)~method.contrast*domain+(1|ResponseId), data=phase1_exp)
car::Anova(model.phase1.contrast.by.domain, type=3, test.statistic="F")
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: log(MASE1_w1)
##                              F Df Df.res                Pr(>F)    
## (Intercept)            90.5384  1 327.59 < 0.00000000000000022 ***
## method.contrast         0.7111  1 326.96                0.3997    
## domain                 20.9351 11 281.09 < 0.00000000000000022 ***
## method.contrast:domain  4.5660 11 278.67            0.00000223 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
data.phase1.MASE.method<-as.data.frame(emmeans(model.phase1.contrast.by.domain,pairwise ~method.contrast|domain, type="response")$contrasts)

#get FDR correction across all pairwise tests
data.phase1.MASE.method$Hochberg <-p.adjust(data.phase1.MASE.method$p.value,
               method = "hochberg")
data.phase1.MASE.method
## domain = eafric:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 1.2012189 0.2611523 326.96    1   0.843
##  p.value  Hochberg
##   0.3997 0.5153505
## 
## domain = easian:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 2.0076070 0.3893891 334.97    1   3.593
##  p.value  Hochberg
##   0.0004 0.0041306
## 
## domain = egend:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 1.7286746 0.3721548 329.76    1   2.542
##  p.value  Hochberg
##   0.0115 0.1031716
## 
## domain = iafric:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 1.1422625 0.2333050 334.32    1   0.651
##  p.value  Hochberg
##   0.5154 0.5153505
## 
## domain = iasian:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 2.2374282 0.4136382 333.97    1   4.356
##  p.value  Hochberg
##   <.0001 0.0002116
## 
## domain = ideoldem:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 1.4395807 0.2554351 333.82    1   2.053
##  p.value  Hochberg
##   0.0408 0.2856787
## 
## domain = ideolrep:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 1.2560434 0.2228688 333.82    1   1.285
##  p.value  Hochberg
##   0.1998 0.5153505
## 
## domain = igend:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 2.1055169 0.4598916 328.41    1   3.409
##  p.value  Hochberg
##   0.0007 0.0073371
## 
## domain = lifesat:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 1.4018032 0.1974374 298.45    1   2.398
##  p.value  Hochberg
##   0.0171 0.1367594
## 
## domain = negaffect:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 0.7677547 0.1434952 333.85    1  -1.414
##  p.value  Hochberg
##   0.1583 0.5153505
## 
## domain = polar:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 0.8708194 0.1570010 331.21    1  -0.767
##  p.value  Hochberg
##   0.4435 0.5153505
## 
## domain = posaffect:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 0.7720119 0.1442909 333.85    1  -1.384
##  p.value  Hochberg
##   0.1671 0.5153505
## 
## Degrees-of-freedom method: kenward-roger 
## Tests are performed on the log scale
partR2(model.phase1.contrast.by.domain, partvars = 
         c("method.contrast","domain:method.contrast","domain"))
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## 
## 
## R2 (marginal) and 95% CI for the full model: 
##  R2    CI_lower CI_upper nboot ndf
##  0.481 NA       NA       1     24 
## 
## ----------
## 
## Part (semi-partial) R2:
##  Predictor(s)                                  R2     CI_lower CI_upper nboot
##  Model                                         0.4810 NA       NA       1    
##  method.contrast                               0.0000 NA       NA       1    
##  domain:method.contrast                        0.0452 NA       NA       1    
##  domain                                        0.1821 NA       NA       1    
##  method.contrast+domain:method.contrast        0.1245 NA       NA       1    
##  method.contrast+domain                        0.1821 NA       NA       1    
##  domain:method.contrast+domain                 0.4613 NA       NA       1    
##  method.contrast+domain:method.contrast+domain 0.4810 NA       NA       1    
##  ndf
##  24 
##  24 
##  13 
##  13 
##  12 
##  13 
##   2 
##   1
write.csv(data.phase1.MASE.method,"contrast1.csv") #for the table in supplement

### Tournament 2
model.phase2.contrast.by.domain<-  lmer(log(MASE1_w2)~method.contrast*domain+(1|ResponseId), data=dat_phase2)
car::Anova(model.phase2.contrast.by.domain, type=3, test.statistic="F")
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: log(MASE1_w2)
##                              F Df Df.res                Pr(>F)    
## (Intercept)            19.5961  1 518.59            0.00001167 ***
## method.contrast         0.7432  1 520.24             0.3890437    
## domain                 14.8472 11 466.93 < 0.00000000000000022 ***
## method.contrast:domain  3.3808 11 462.08             0.0001621 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
data.phase2.MASE.method<-as.data.frame(emmeans(model.phase2.contrast.by.domain,pairwise ~method.contrast|domain, type="response")$contrasts)

#get FDR correction across all pairwise tests
data.phase2.MASE.method$Hochberg <-p.adjust(data.phase2.MASE.method$p.value,
               method = "hochberg")
data.phase2.MASE.method
## domain = iafric:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 1.2354661 0.3030332 520.24    1   0.862
##  p.value  Hochberg
##   0.3890 0.6851539
## 
## domain = ideolrep:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 1.1672861 0.2351012 521.67    1   0.768
##  p.value  Hochberg
##   0.4428 0.6851539
## 
## domain = eafric:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 1.1066033 0.2763163 510.62    1   0.406
##  p.value  Hochberg
##   0.6852 0.6851539
## 
## domain = negaffect:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 0.7902092 0.1545712 519.24    1  -1.204
##  p.value  Hochberg
##   0.2292 0.6851539
## 
## domain = lifesat:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 2.0674427 0.3220152 478.22    1   4.663
##  p.value  Hochberg
##   <.0001 0.0000485
## 
## domain = easian:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 1.1252688 0.2518579 521.99    1   0.527
##  p.value  Hochberg
##   0.5982 0.6851539
## 
## domain = ideoldem:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 1.6112793 0.3245252 521.67    1   2.368
##  p.value  Hochberg
##   0.0182 0.1822560
## 
## domain = iasian:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 1.2762142 0.2730107 520.24    1   1.140
##  p.value  Hochberg
##   0.2548 0.6851539
## 
## domain = polar:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 1.0859360 0.2139983 517.81    1   0.418
##  p.value  Hochberg
##   0.6759 0.6851539
## 
## domain = igend:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 1.1446020 0.2593657 515.45    1   0.596
##  p.value  Hochberg
##   0.5514 0.6851539
## 
## domain = posaffect:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 0.7084102 0.1385707 519.24    1  -1.762
##  p.value  Hochberg
##   0.0786 0.6851539
## 
## domain = egend:
##  contrast                                ratio        SE     df null t.ratio
##  method.contrast0 / method.contrast1 2.0887649 0.4632207 519.14    1   3.321
##  p.value  Hochberg
##   0.0010 0.0105471
## 
## Degrees-of-freedom method: kenward-roger 
## Tests are performed on the log scale
partR2(model.phase2.contrast.by.domain, partvars = 
         c("method.contrast","domain:method.contrast","domain"))
## 
## 
## R2 (marginal) and 95% CI for the full model: 
##  R2     CI_lower CI_upper nboot ndf
##  0.3376 NA       NA       1     24 
## 
## ----------
## 
## Part (semi-partial) R2:
##  Predictor(s)                                  R2     CI_lower CI_upper nboot
##  Model                                         0.3376 NA       NA       1    
##  method.contrast                               0.0000 NA       NA       1    
##  domain:method.contrast                        0.0280 NA       NA       1    
##  domain                                        0.1433 NA       NA       1    
##  method.contrast+domain:method.contrast        0.1107 NA       NA       1    
##  method.contrast+domain                        0.1433 NA       NA       1    
##  domain:method.contrast+domain                 0.3200 NA       NA       1    
##  method.contrast+domain:method.contrast+domain 0.3376 NA       NA       1    
##  ndf
##  24 
##  24 
##  13 
##  13 
##  12 
##  13 
##   2 
##   1
write.csv(data.phase2.MASE.method,"contrast2.csv") #for table in the supplement

## supplementary model with all three forecasting methods*domain interaction is below. We use it to get estimates for modelling by domain by method
### Tournament 1
model.phase1<-  lmer(log(MASE1_w1)~domain*Method.code+(1|ResponseId), data=subset(phase1,isExpert.factor=="Academic"))
car::Anova(model.phase1,type="III") #sig interaction!
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: log(MASE1_w1)
##                       Chisq Df            Pr(>Chisq)    
## (Intercept)        172.8090  1 < 0.00000000000000022 ***
## domain             341.6542 11 < 0.00000000000000022 ***
## Method.code          0.7275  2                0.6951    
## domain:Method.code  56.6044 22            0.00007001 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summ(model.phase1, digits=4)
Observations 359
Dependent variable log(MASE1_w1)
Type Mixed effects linear regression
AIC 602.3151
BIC 749.8813
Pseudo-R² (fixed effects) 0.5029
Pseudo-R² (total) 0.7461
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 1.7358 0.1320 13.1457 278.1005 0.0000
domainideolrep -0.1592 0.1504 -1.0585 193.6055 0.2911
domaineafric -0.2724 0.1565 -1.7409 173.0634 0.0835
domainnegaffect -0.2706 0.1563 -1.7319 181.3403 0.0850
domainlifesat -0.6273 0.1533 -4.0927 228.9528 0.0001
domaineasian -0.7421 0.1643 -4.5161 176.2974 0.0000
domainideoldem -0.7593 0.1504 -5.0482 193.6055 0.0000
domainiasian -1.3199 0.1515 -8.7134 174.7120 0.0000
domainpolar -1.0437 0.1674 -6.2365 204.6899 0.0000
domainigend -1.6489 0.1568 -10.5155 175.6936 0.0000
domainposaffect -1.4162 0.1563 -9.0632 181.3403 0.0000
domainegend -1.9250 0.1648 -11.6831 179.4459 0.0000
Method.codeHybrid 0.0556 0.5507 0.1010 251.4557 0.9196
Method.codeIntuition/Theory 0.1771 0.2076 0.8529 322.8783 0.3943
domainideolrep:Method.codeHybrid 0.3725 0.6929 0.5376 171.9116 0.5916
domaineafric:Method.codeHybrid -0.2423 0.6091 -0.3978 168.2225 0.6913
domainnegaffect:Method.codeHybrid -0.1182 0.7569 -0.1561 300.0468 0.8760
domainlifesat:Method.codeHybrid 0.2715 0.5787 0.4691 258.9721 0.6394
domaineasian:Method.codeHybrid -0.4810 0.5673 -0.8480 224.9875 0.3974
domainideoldem:Method.codeHybrid 0.3028 0.6929 0.4371 171.9116 0.6626
domainiasian:Method.codeHybrid 0.2342 0.6078 0.3853 168.3019 0.7005
domainpolar:Method.codeHybrid 0.7547 0.7287 1.0358 322.9623 0.3011
domainigend:Method.codeHybrid -0.5392 0.6091 -0.8852 168.3925 0.3773
domainposaffect:Method.codeHybrid 0.2784 0.7569 0.3678 300.0468 0.7133
domainegend:Method.codeHybrid -0.0101 0.6112 -0.0166 168.7014 0.9868
domainideolrep:Method.codeIntuition/Theory 0.0892 0.2451 0.3641 203.9608 0.7162
domaineafric:Method.codeIntuition/Theory 0.0508 0.2673 0.1901 182.3547 0.8494
domainnegaffect:Method.codeIntuition/Theory -0.4011 0.2539 -1.5800 207.0697 0.1156
domainlifesat:Method.codeIntuition/Theory 0.2954 0.2365 1.2493 248.9589 0.2127
domaineasian:Method.codeIntuition/Theory 0.5121 0.2565 1.9966 198.3860 0.0472
domainideoldem:Method.codeIntuition/Theory 0.2190 0.2451 0.8935 203.9608 0.3726
domainiasian:Method.codeIntuition/Theory 0.7005 0.2422 2.8921 188.1488 0.0043
domainpolar:Method.codeIntuition/Theory -0.2216 0.2470 -0.8973 207.6806 0.3706
domainigend:Method.codeIntuition/Theory 0.5807 0.2666 2.1779 180.4570 0.0307
domainposaffect:Method.codeIntuition/Theory -0.3708 0.2539 -1.4606 207.0697 0.1456
domainegend:Method.codeIntuition/Theory 0.4232 0.2695 1.5705 192.5191 0.1179
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
ResponseId (Intercept) 0.4074
Residual 0.4162
Grouping Variables
Group # groups ICC
ResponseId 86 0.4892
emmeans(model.phase1,trt.vs.ctrl ~Method.code|domain, adjust = "none") #lay vs. rest
## $emmeans
## domain = iafric:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       1.7358 0.132 306   1.4753   1.9962
##  Hybrid            1.7914 0.541 284   0.7258   2.8570
##  Intuition/Theory  1.9128 0.164 323   1.5909   2.2348
## 
## domain = ideolrep:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       1.5766 0.121 279   1.3374   1.8157
##  Hybrid            2.0047 0.412 158   1.1913   2.8180
##  Intuition/Theory  1.8429 0.139 320   1.5692   2.1165
## 
## domain = eafric:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       1.4634 0.136 311   1.1958   1.7309
##  Hybrid            1.2767 0.541 284   0.2111   2.3423
##  Intuition/Theory  1.6912 0.179 315   1.3395   2.0430
## 
## domain = negaffect:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       1.4651 0.130 314   1.2091   1.7212
##  Hybrid            1.4026 0.513 323   0.3929   2.4124
##  Intuition/Theory  1.2411 0.146 319   0.9547   1.5275
## 
## domain = lifesat:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       1.1084 0.121 273   0.8693   1.3476
##  Hybrid            1.4355 0.152 289   1.1366   1.7344
##  Intuition/Theory  1.5809 0.107 242   1.3706   1.7912
## 
## domain = easian:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       0.9937 0.144 319   0.7105   1.2769
##  Hybrid            0.5683 0.368 310  -0.1549   1.2916
##  Intuition/Theory  1.6829 0.144 320   1.3987   1.9671
## 
## domain = ideoldem:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       0.9765 0.121 279   0.7374   1.2156
##  Hybrid            1.3350 0.412 158   0.5216   2.1483
##  Intuition/Theory  1.3725 0.139 320   1.0989   1.6462
## 
## domain = iasian:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       0.4158 0.130 298   0.1608   0.6709
##  Hybrid            0.7057 0.541 284  -0.3599   1.7712
##  Intuition/Theory  1.2934 0.140 316   1.0174   1.5695
## 
## domain = polar:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       0.6921 0.141 309   0.4142   0.9699
##  Hybrid            1.5024 0.461 280   0.5945   2.4104
##  Intuition/Theory  0.6475 0.123 295   0.4054   0.8896
## 
## domain = igend:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       0.0869 0.137 309  -0.1817   0.3555
##  Hybrid           -0.3967 0.541 284  -1.4622   0.6689
##  Intuition/Theory  0.8447 0.180 316   0.4914   1.1980
## 
## domain = posaffect:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       0.3196 0.130 314   0.0636   0.5756
##  Hybrid            0.6536 0.513 323  -0.3561   1.6634
##  Intuition/Theory  0.1259 0.146 319  -0.1605   0.4123
## 
## domain = egend:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven      -0.1893 0.144 318  -0.4733   0.0948
##  Hybrid           -0.1438 0.541 284  -1.2093   0.9218
##  Intuition/Theory  0.4110 0.170 320   0.0765   0.7455
## 
## Degrees-of-freedom method: kenward-roger 
## Results are given on the log (not the response) scale. 
## Confidence level used: 0.95 
## 
## $contrasts
## domain = iafric:
##  contrast                           estimate    SE  df t.ratio p.value
##  Hybrid - (Data-Driven)               0.0556 0.554 294   0.100  0.9200
##  (Intuition/Theory) - (Data-Driven)   0.1771 0.208 323   0.850  0.3958
## 
## domain = ideolrep:
##  contrast                           estimate    SE  df t.ratio p.value
##  Hybrid - (Data-Driven)               0.4281 0.429 166   0.997  0.3202
##  (Intuition/Theory) - (Data-Driven)   0.2663 0.181 321   1.468  0.1431
## 
## domain = eafric:
##  contrast                           estimate    SE  df t.ratio p.value
##  Hybrid - (Data-Driven)              -0.1867 0.554 295  -0.337  0.7365
##  (Intuition/Theory) - (Data-Driven)   0.2279 0.222 318   1.025  0.3060
## 
## domain = negaffect:
##  contrast                           estimate    SE  df t.ratio p.value
##  Hybrid - (Data-Driven)              -0.0625 0.529 323  -0.118  0.9060
##  (Intuition/Theory) - (Data-Driven)  -0.2240 0.192 323  -1.167  0.2442
## 
## domain = lifesat:
##  contrast                           estimate    SE  df t.ratio p.value
##  Hybrid - (Data-Driven)               0.3271 0.189 314   1.730  0.0846
##  (Intuition/Theory) - (Data-Driven)   0.4725 0.160 280   2.951  0.0034
## 
## domain = easian:
##  contrast                           estimate    SE  df t.ratio p.value
##  Hybrid - (Data-Driven)              -0.4254 0.388 320  -1.095  0.2742
##  (Intuition/Theory) - (Data-Driven)   0.6892 0.202 323   3.417  0.0007
## 
## domain = ideoldem:
##  contrast                           estimate    SE  df t.ratio p.value
##  Hybrid - (Data-Driven)               0.3585 0.429 166   0.835  0.4050
##  (Intuition/Theory) - (Data-Driven)   0.3961 0.181 321   2.183  0.0298
## 
## domain = iasian:
##  contrast                           estimate    SE  df t.ratio p.value
##  Hybrid - (Data-Driven)               0.2898 0.553 293   0.524  0.6006
##  (Intuition/Theory) - (Data-Driven)   0.8776 0.189 319   4.646  <.0001
## 
## domain = polar:
##  contrast                           estimate    SE  df t.ratio p.value
##  Hybrid - (Data-Driven)               0.8104 0.476 271   1.704  0.0895
##  (Intuition/Theory) - (Data-Driven)  -0.0446 0.185 316  -0.241  0.8101
## 
## domain = igend:
##  contrast                           estimate    SE  df t.ratio p.value
##  Hybrid - (Data-Driven)              -0.4836 0.556 292  -0.869  0.3853
##  (Intuition/Theory) - (Data-Driven)   0.7578 0.223 319   3.394  0.0008
## 
## domain = posaffect:
##  contrast                           estimate    SE  df t.ratio p.value
##  Hybrid - (Data-Driven)               0.3340 0.529 323   0.631  0.5282
##  (Intuition/Theory) - (Data-Driven)  -0.1937 0.192 323  -1.009  0.3139
## 
## domain = egend:
##  contrast                           estimate    SE  df t.ratio p.value
##  Hybrid - (Data-Driven)               0.0455 0.558 293   0.082  0.9351
##  (Intuition/Theory) - (Data-Driven)   0.6003 0.221 320   2.719  0.0069
## 
## Degrees-of-freedom method: kenward-roger 
## Results are given on the log (not the response) scale.
emmeans(model.phase1,pairwise ~Method.code|domain, adjust = "none")
## $emmeans
## domain = iafric:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       1.7358 0.132 306   1.4753   1.9962
##  Hybrid            1.7914 0.541 284   0.7258   2.8570
##  Intuition/Theory  1.9128 0.164 323   1.5909   2.2348
## 
## domain = ideolrep:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       1.5766 0.121 279   1.3374   1.8157
##  Hybrid            2.0047 0.412 158   1.1913   2.8180
##  Intuition/Theory  1.8429 0.139 320   1.5692   2.1165
## 
## domain = eafric:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       1.4634 0.136 311   1.1958   1.7309
##  Hybrid            1.2767 0.541 284   0.2111   2.3423
##  Intuition/Theory  1.6912 0.179 315   1.3395   2.0430
## 
## domain = negaffect:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       1.4651 0.130 314   1.2091   1.7212
##  Hybrid            1.4026 0.513 323   0.3929   2.4124
##  Intuition/Theory  1.2411 0.146 319   0.9547   1.5275
## 
## domain = lifesat:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       1.1084 0.121 273   0.8693   1.3476
##  Hybrid            1.4355 0.152 289   1.1366   1.7344
##  Intuition/Theory  1.5809 0.107 242   1.3706   1.7912
## 
## domain = easian:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       0.9937 0.144 319   0.7105   1.2769
##  Hybrid            0.5683 0.368 310  -0.1549   1.2916
##  Intuition/Theory  1.6829 0.144 320   1.3987   1.9671
## 
## domain = ideoldem:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       0.9765 0.121 279   0.7374   1.2156
##  Hybrid            1.3350 0.412 158   0.5216   2.1483
##  Intuition/Theory  1.3725 0.139 320   1.0989   1.6462
## 
## domain = iasian:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       0.4158 0.130 298   0.1608   0.6709
##  Hybrid            0.7057 0.541 284  -0.3599   1.7712
##  Intuition/Theory  1.2934 0.140 316   1.0174   1.5695
## 
## domain = polar:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       0.6921 0.141 309   0.4142   0.9699
##  Hybrid            1.5024 0.461 280   0.5945   2.4104
##  Intuition/Theory  0.6475 0.123 295   0.4054   0.8896
## 
## domain = igend:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       0.0869 0.137 309  -0.1817   0.3555
##  Hybrid           -0.3967 0.541 284  -1.4622   0.6689
##  Intuition/Theory  0.8447 0.180 316   0.4914   1.1980
## 
## domain = posaffect:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven       0.3196 0.130 314   0.0636   0.5756
##  Hybrid            0.6536 0.513 323  -0.3561   1.6634
##  Intuition/Theory  0.1259 0.146 319  -0.1605   0.4123
## 
## domain = egend:
##  Method.code       emmean    SE  df lower.CL upper.CL
##  Data-Driven      -0.1893 0.144 318  -0.4733   0.0948
##  Hybrid           -0.1438 0.541 284  -1.2093   0.9218
##  Intuition/Theory  0.4110 0.170 320   0.0765   0.7455
## 
## Degrees-of-freedom method: kenward-roger 
## Results are given on the log (not the response) scale. 
## Confidence level used: 0.95 
## 
## $contrasts
## domain = iafric:
##  contrast                           estimate    SE  df t.ratio p.value
##  (Data-Driven) - Hybrid              -0.0556 0.554 294  -0.100  0.9200
##  (Data-Driven) - (Intuition/Theory)  -0.1771 0.208 323  -0.850  0.3958
##  Hybrid - (Intuition/Theory)         -0.1214 0.565 291  -0.215  0.8301
## 
## domain = ideolrep:
##  contrast                           estimate    SE  df t.ratio p.value
##  (Data-Driven) - Hybrid              -0.4281 0.429 166  -0.997  0.3202
##  (Data-Driven) - (Intuition/Theory)  -0.2663 0.181 321  -1.468  0.1431
##  Hybrid - (Intuition/Theory)          0.1618 0.435 174   0.372  0.7101
## 
## domain = eafric:
##  contrast                           estimate    SE  df t.ratio p.value
##  (Data-Driven) - Hybrid               0.1867 0.554 295   0.337  0.7365
##  (Data-Driven) - (Intuition/Theory)  -0.2279 0.222 318  -1.025  0.3060
##  Hybrid - (Intuition/Theory)         -0.4145 0.570 294  -0.727  0.4676
## 
## domain = negaffect:
##  contrast                           estimate    SE  df t.ratio p.value
##  (Data-Driven) - Hybrid               0.0625 0.529 323   0.118  0.9060
##  (Data-Driven) - (Intuition/Theory)   0.2240 0.192 323   1.167  0.2442
##  Hybrid - (Intuition/Theory)          0.1615 0.530 323   0.305  0.7609
## 
## domain = lifesat:
##  contrast                           estimate    SE  df t.ratio p.value
##  (Data-Driven) - Hybrid              -0.3271 0.189 314  -1.730  0.0846
##  (Data-Driven) - (Intuition/Theory)  -0.4725 0.160 280  -2.951  0.0034
##  Hybrid - (Intuition/Theory)         -0.1454 0.182 301  -0.799  0.4252
## 
## domain = easian:
##  contrast                           estimate    SE  df t.ratio p.value
##  (Data-Driven) - Hybrid               0.4254 0.388 320   1.095  0.2742
##  (Data-Driven) - (Intuition/Theory)  -0.6892 0.202 323  -3.417  0.0007
##  Hybrid - (Intuition/Theory)         -1.1146 0.394 313  -2.826  0.0050
## 
## domain = ideoldem:
##  contrast                           estimate    SE  df t.ratio p.value
##  (Data-Driven) - Hybrid              -0.3585 0.429 166  -0.835  0.4050
##  (Data-Driven) - (Intuition/Theory)  -0.3961 0.181 321  -2.183  0.0298
##  Hybrid - (Intuition/Theory)         -0.0376 0.435 174  -0.086  0.9312
## 
## domain = iasian:
##  contrast                           estimate    SE  df t.ratio p.value
##  (Data-Driven) - Hybrid              -0.2898 0.553 293  -0.524  0.6006
##  (Data-Driven) - (Intuition/Theory)  -0.8776 0.189 319  -4.646  <.0001
##  Hybrid - (Intuition/Theory)         -0.5878 0.559 287  -1.051  0.2940
## 
## domain = polar:
##  contrast                           estimate    SE  df t.ratio p.value
##  (Data-Driven) - Hybrid              -0.8104 0.476 271  -1.704  0.0895
##  (Data-Driven) - (Intuition/Theory)   0.0446 0.185 316   0.241  0.8101
##  Hybrid - (Intuition/Theory)          0.8549 0.476 284   1.795  0.0738
## 
## domain = igend:
##  contrast                           estimate    SE  df t.ratio p.value
##  (Data-Driven) - Hybrid               0.4836 0.556 292   0.869  0.3853
##  (Data-Driven) - (Intuition/Theory)  -0.7578 0.223 319  -3.394  0.0008
##  Hybrid - (Intuition/Theory)         -1.2413 0.570 294  -2.177  0.0303
## 
## domain = posaffect:
##  contrast                           estimate    SE  df t.ratio p.value
##  (Data-Driven) - Hybrid              -0.3340 0.529 323  -0.631  0.5282
##  (Data-Driven) - (Intuition/Theory)   0.1937 0.192 323   1.009  0.3139
##  Hybrid - (Intuition/Theory)          0.5277 0.530 323   0.995  0.3204
## 
## domain = egend:
##  contrast                           estimate    SE  df t.ratio p.value
##  (Data-Driven) - Hybrid              -0.0455 0.558 293  -0.082  0.9351
##  (Data-Driven) - (Intuition/Theory)  -0.6003 0.221 320  -2.719  0.0069
##  Hybrid - (Intuition/Theory)         -0.5548 0.567 292  -0.978  0.3289
## 
## Degrees-of-freedom method: kenward-roger 
## Results are given on the log (not the response) scale.
partR2(model.phase1, partvars = 
         c("domain","domain:Method.code","domain"))
## 
## 
## R2 (marginal) and 95% CI for the full model: 
##  R2     CI_lower CI_upper nboot ndf
##  0.5029 NA       NA       1     36 
## 
## ----------
## 
## Part (semi-partial) R2:
##  Predictor(s)                     R2     CI_lower CI_upper nboot ndf
##  Model                            0.5029 NA       NA       1     36 
##  domain                           0.1201 NA       NA       1     48 
##  domain:Method.code               0.1239 NA       NA       1     15 
##  domain                           0.1201 NA       NA       1     48 
##  domain+domain:Method.code        0.4839 NA       NA       1      4 
##  domain+domain                    0.1201 NA       NA       1     48 
##  domain:Method.code+domain        0.4839 NA       NA       1      4 
##  domain+domain:Method.code+domain 0.4839 NA       NA       1      4
data.phase1.MASE<-as.data.frame(emmeans(model.phase1,pairwise ~Method.code|domain, type = "response", adjust = "none")$emmeans) 

### Tournament 2
data.phase2.model<-  lmer(log(MASE1_w2)~domain*Method.code+(1|ResponseId), data=dat_phase2)
car::Anova(data.phase2.model,type="III") #sig interaction!
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: log(MASE1_w2)
##                       Chisq Df            Pr(>Chisq)    
## (Intercept)         23.4829  1            0.00000126 ***
## domain             169.8096 11 < 0.00000000000000022 ***
## Method.code          1.4138  2              0.493167    
## domain:Method.code  46.4359 22              0.001737 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
partR2(data.phase2.model, partvars = 
         c("domain","domain:Method.code","domain"))
## 
## 
## R2 (marginal) and 95% CI for the full model: 
##  R2     CI_lower CI_upper nboot ndf
##  0.3534 NA       NA       1     36 
## 
## ----------
## 
## Part (semi-partial) R2:
##  Predictor(s)                     R2     CI_lower CI_upper nboot ndf
##  Model                            0.3534 NA       NA       1     36 
##  domain                           0.0000 NA       NA       1     48 
##  domain:Method.code               0.0178 NA       NA       1     15 
##  domain                           0.0000 NA       NA       1     48 
##  domain+domain:Method.code        0.2489 NA       NA       1      4 
##  domain+domain                    0.0000 NA       NA       1     48 
##  domain:Method.code+domain        0.2489 NA       NA       1      4 
##  domain+domain:Method.code+domain 0.2489 NA       NA       1      4
data.phase2.MASE<-as.data.frame(emmeans(data.phase2.model, pairwise~Method.code|domain, adjust = "none", type = "response")$emmeans) #backtransformed to the original scale

## Supplementary analyses to examine if data-free forecasts of social scientists were not better than lay estimates, in Tournament 1
###EXAMINE ONLY ACADEMICS, USING CONTRAST OF THEORY vs. DATA.HYBRID
phase1$method.contrast.layppl[phase1$Method.code=='Intuition/Theory']<-"Sci data-free"
phase1$method.contrast.layppl[phase1$Method.code=='Lay People']<-"lay people"
phase1$method.contrast.layppl[phase1$Method.code=='Data-Driven']<-"Sci data-incl."
phase1$method.contrast.layppl[phase1$Method.code=='Hybrid']<-"Sci data-incl."

phase1$MASE1_w1_log<-log(phase1$MASE1_w1) #this this to get emmeans-based effect size Cohen's d for pairwise comparisons

model.phase1.contrast.lay<-  lmer(MASE1_w1_log~method.contrast.layppl+(1|ResponseId), data=phase1)
car::Anova(model.phase1.contrast.lay,type="III") #sig domain effect,  and sig interaction
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: MASE1_w1_log
##                            Chisq Df            Pr(>Chisq)    
## (Intercept)            2381.9755  1 < 0.00000000000000022 ***
## method.contrast.layppl    9.9256  2              0.006993 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(model.phase1.contrast.lay,specs = trt.vs.ctrl ~method.contrast.layppl, adjust = "fdr",type="response" ) 
## $emmeans
##  method.contrast.layppl emmean     SE  df lower.CL upper.CL
##  lay people               1.32 0.0270 899     1.26     1.37
##  Sci data-free            1.23 0.1030 560     1.02     1.43
##  Sci data-incl.           1.01 0.0957 449     0.82     1.20
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast                      estimate     SE  df t.ratio p.value
##  (Sci data-free) - lay people   -0.0922 0.1060 577  -0.869  0.3854
##  (Sci data-incl.) - lay people  -0.3090 0.0994 470  -3.108  0.0040
## 
## Degrees-of-freedom method: kenward-roger 
## P value adjustment: fdr method for 2 tests
#significant difference between academics who used data and lay people, but not between academics who did not use data and lay people
( EMM = emmeans(model.phase1.contrast.lay, "method.contrast.layppl") )
##  method.contrast.layppl emmean     SE  df lower.CL upper.CL
##  lay people               1.32 0.0270 899     1.26     1.37
##  Sci data-free            1.23 0.1030 560     1.02     1.43
##  Sci data-incl.           1.01 0.0957 449     0.82     1.20
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95
pairs(EMM)
##  contrast                           estimate     SE  df t.ratio p.value
##  lay people - (Sci data-free)         0.0922 0.1060 577   0.869  0.6603
##  lay people - (Sci data-incl.)        0.3090 0.0994 470   3.108  0.0057
##  (Sci data-free) - (Sci data-incl.)   0.2168 0.1360 627   1.590  0.2506
## 
## Degrees-of-freedom method: kenward-roger 
## P value adjustment: tukey method for comparing a family of 3 estimates
eff_size(EMM,sigma = sigma(model.phase1.contrast.lay), edf =df.residual(model.phase1.contrast.lay) ) #using the smallest DF among the three
##  contrast                           effect.size    SE  df lower.CL upper.CL
##  lay people - (Sci data-free)             0.117 0.134 560  -0.1474    0.381
##  lay people - (Sci data-incl.)            0.391 0.126 449   0.1436    0.639
##  (Sci data-free) - (Sci data-incl.)       0.274 0.173 449  -0.0648    0.614
## 
## sigma used for effect sizes: 0.7897 
## Degrees-of-freedom method: inherited from kenward-roger when re-gridding 
## Confidence level used: 0.95
############SOME EXTRA FIGURES: NOT USED IN THE MANUSCRIPT OR SUPPLEMENT##################
#########BEGINNING###################
###Tournament 1
#arrange in descending order based on MASE w2 of academics
data.phase1.MASE$domain<-factor(data.phase1.MASE$domain,levels=c("ideolrep","negaffect","ideoldem","polar","iafric","lifesat","eafric","easian","egend","iasian","igend","posaffect"))

data.phase1.MASE %>% 
 ggplot(aes(x = domain, y = response, colour = Method.code, fill=Method.code))+
 geom_pointrange(aes(ymin=lower.CL, ymax=upper.CL), position=pd)+  theme_minimal(base_size = 14) +geom_hline(yintercept =1, linetype='dashed', color='red', 14)+coord_flip()+
theme(legend.position="bottom") +scale_color_d3()+scale_fill_d3()+  scale_x_discrete(labels=labeling, name="")+
  labs(colour = "",fill="", x="",y="MASE (M +/- 95%CI)") 
## Warning: `geom_hline()`: Ignoring `mapping` because `yintercept` was provided.

data.phase1.MASE.total %>% 
 ggplot(aes(x = Method.code, y = response, colour = Method.code, fill=Method.code))+
 geom_pointrange(aes(ymin=lower.CL, ymax=upper.CL), position=pd)+  theme_minimal(base_size = 14) +geom_hline(yintercept =1, linetype='dashed', color='red', 14)+coord_flip()+
theme(legend.position="bottom") +scale_color_d3()+scale_fill_d3()+  scale_x_discrete(labels=labeling, name="")+
  labs(colour = "",fill="", x="",y="MASE (M +/- 95%CI)") 
## Warning: `geom_hline()`: Ignoring `mapping` because `yintercept` was provided.

### Tournament 2
data.phase2.MASE %>% 
 ggplot(aes(x = domain, y = response, colour = Method.code, fill=Method.code))+
 geom_pointrange(aes(ymin=lower.CL, ymax=upper.CL), position=pd)+  theme_minimal(base_size = 14) +geom_hline(yintercept =1, linetype='dashed', color='red', 14)+coord_flip()+
theme(legend.position="bottom") +scale_color_d3()+scale_fill_d3()+  scale_x_discrete(labels=labeling, name="")+
  labs(colour = "",fill="", x="",y="MASE (M +/- 95%CI)") 
## Warning: `geom_hline()`: Ignoring `mapping` because `yintercept` was provided.

data.phase2.MASE.total %>% 
 ggplot(aes(x = Method.code, y = response, colour = Method.code, fill=Method.code))+
 geom_pointrange(aes(ymin=lower.CL, ymax=upper.CL), position=pd)+  theme_minimal(base_size = 14) +geom_hline(yintercept =1, linetype='dashed', color='red', 14)+coord_flip()+
theme(legend.position="bottom") +scale_color_d3()+scale_fill_d3()+  scale_x_discrete(labels=labeling, name="")+
  labs(colour = "",fill="", x="",y="MASE (M +/- 95%CI)") 
## Warning: `geom_hline()`: Ignoring `mapping` because `yintercept` was provided.

############SOME EXTRA FIGURES: NOT USED IN THE MANUSCRIPT OR SUPPLEMENT##################
#########END###################

###CREATE FIGURE FOR THE MAIN TEXT
data.phase1.MASE.total$Wave<-"First Tournament (May 2020)"
data.phase2.MASE.total$Wave<-"Second Tournament (Nov 2020)"
#combine
means.compare.by.method<-bind_rows(data.phase1.MASE.total,data.phase2.MASE.total)
means.compare.by.method$Method<-means.compare.by.method$Method.code #create a copy to port values to
means.compare.by.method$Method<-c('Data-Driven\n51%','Hybrid\n7%','Intuition/\nTheory\n42%','Data-Driven\n53%','Hybrid\n8%','Intuition/\nTheory\n39%')
#arrange in descending order based on MASE w2 of academics
means.compare.by.method$Wave<-factor(means.compare.by.method$Wave,levels=c("First Tournament (May 2020)","Second Tournament (Nov 2020)"))


#plot figure
### Figure 4 in the manuscript

means.compare.by.method %>%  
 ggplot(aes(x = Method, y = response, color = Method.code))+
 geom_pointrange(aes(ymin=lower.CL, ymax=upper.CL), position=pd)+  theme_minimal(base_size = 14)+geom_hline(yintercept =1, linetype='dotted', color='black',14)+
  geom_hline(yintercept =1.7665, linetype='dashed', color='blue',16)+theme(legend.position="none")+scale_color_futurama()+  labs(y="MASE (M +/- 95%CI)",x="",shape="",color="")+ facet_wrap(~ Wave, scales = "free_x")
## Warning: `geom_hline()`: Ignoring `mapping` because `yintercept` was provided.
## Warning: `geom_hline()`: Ignoring `mapping` because `yintercept` was provided.

examine effects of covariates across both tournaments

#examine effects of covariates

both.sets$inaccuracy_log<-log(both.sets$inaccuracy)

both.sets$Multidisciplinary<-ifelse(both.sets$multi_dis.factor=="Single domain expertise",0,1)
both.sets$covidconditional[is.na(both.sets$covidconditional)]<-0
both.sets$Method.complex[is.na(both.sets$Method.complex)]<-1 #simple when no extra info is provided, because the rest {number of parameters et.) suggests no extra factors considered}
both.sets$multi_dis.factor[is.na(both.sets$multi_dis.factor)]<-"Single domain expertise" #(setting is NA to non multidisciplinary)

###analyses with domain

model.bothTournaments.COVs<-lmer(inaccuracy_log~domain+parameters_coded+Method.complex+covidconditional+CounterFactual_Presence_Final+n_domains+team_discipline.datasci+team_discipline.SBsci+Multidisciplinary+team_size.coded+team_education+confidence+subexpert+Domain_Publications+previous_tournament.coded+(1|team_name), data=both.sets)
car::Anova(model.bothTournaments.COVs,type="III",test.statistic="F") 
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: inaccuracy_log
##                                     F Df Df.res                Pr(>F)    
## (Intercept)                    8.8232  1 203.01              0.003333 ** 
## domain                        43.6113 11 822.54 < 0.00000000000000022 ***
## parameters_coded               3.0662  1 306.89              0.080934 .  
## Method.complex                 5.3771  1 345.07              0.020986 *  
## covidconditional               0.2663  1 758.90              0.605957    
## CounterFactual_Presence_Final  1.8533  1 805.70              0.173775    
## n_domains                      1.5009  1 142.21              0.222551    
## team_discipline.datasci        1.7808  1 186.34              0.183680    
## team_discipline.SBsci          3.1329  1 194.11              0.078296 .  
## Multidisciplinary              3.0012  1 184.80              0.084871 .  
## team_size.coded                0.1168  1  92.65              0.733253    
## team_education                 1.0394  1 106.45              0.310268    
## confidence                     0.1298  1 674.27              0.718734    
## subexpert                      1.4985  1 651.58              0.221352    
## Domain_Publications            9.0179  1 783.36              0.002759 ** 
## previous_tournament.coded      4.0534  1 102.31              0.046709 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summ(model.bothTournaments.COVs, conf.method="boot", digits=5, center=T)
Observations 905
Dependent variable inaccuracy_log
Type Mixed effects linear regression
AIC 1963.92512
BIC 2098.54730
Pseudo-R² (fixed effects) 0.31437
Pseudo-R² (total) 0.57823
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 2.14816 0.65636 3.27284 114.94047 0.00141
domaineasian -0.20992 0.11519 -1.82236 730.09584 0.06881
domainegend -0.82731 0.11756 -7.03758 730.89228 0.00000
domainiafric 0.25099 0.11461 2.19005 720.69929 0.02884
domainiasian -0.62725 0.11303 -5.54956 739.76037 0.00000
domainideoldem 0.05044 0.11234 0.44898 761.18138 0.65358
domainideolrep 0.55978 0.11234 4.98301 761.14852 0.00000
domainigend -0.88894 0.11668 -7.61842 721.31762 0.00000
domainlifesat -0.03872 0.10771 -0.35943 825.46738 0.71937
domainnegaffect 0.12952 0.11093 1.16757 742.38576 0.24335
domainpolar -0.20464 0.11270 -1.81579 771.48398 0.06979
domainposaffect -1.03584 0.11093 -9.33740 742.38576 0.00000
parameters_coded 0.02350 0.01338 1.75669 191.45580 0.08057
Method.complex 0.13552 0.05814 2.33095 220.81715 0.02066
covidconditional -0.04152 0.08017 -0.51793 673.63521 0.60468
CounterFactual_Presence_Final 0.08455 0.06188 1.36627 747.92777 0.17227
n_domains -0.01613 0.01313 -1.22866 80.05299 0.22280
team_discipline.datasci -0.89057 0.66728 -1.33463 107.71491 0.18481
team_discipline.SBsci -1.15326 0.65151 -1.77014 112.73673 0.07941
Multidisciplinary -1.16098 0.67008 -1.73258 106.72209 0.08606
team_size.coded -0.01995 0.05832 -0.34207 50.66636 0.73372
team_education -0.00135 0.00132 -1.02056 58.67533 0.31165
confidence 0.00985 0.02722 0.36193 554.61258 0.71755
subexpert 0.03459 0.02813 1.22975 525.60689 0.21934
Domain_Publications -0.25632 0.08505 -3.01368 711.64152 0.00267
previous_tournament.coded -0.34930 0.17333 -2.01516 56.25872 0.04868
p values calculated using Satterthwaite d.f. ; Continuous predictors are mean-centered. The outcome variable remains in its original units.
Random Effects
Group Parameter Std. Dev.
team_name (Intercept) 0.47733
Residual 0.60350
Grouping Variables
Group # groups ICC
team_name 120 0.38484
#Rsq = 0.31437
model.bothTournaments.no.COVs<-lmer(inaccuracy_log~domain+(1|team_name), data=both.sets)
summ(model.bothTournaments.no.COVs, conf.method="boot", digits=3, center=T)
Observations 905
Dependent variable inaccuracy_log
Type Mixed effects linear regression
AIC 1908.340
BIC 1975.652
Pseudo-R² (fixed effects) 0.271
Pseudo-R² (total) 0.535
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 0.959 0.099 9.688 648.740 0.000
domaineasian -0.194 0.117 -1.664 756.707 0.096
domainegend -0.816 0.119 -6.867 755.260 0.000
domainiafric 0.248 0.116 2.141 742.232 0.033
domainiasian -0.611 0.114 -5.374 766.923 0.000
domainideoldem 0.052 0.113 0.458 781.884 0.647
domainideolrep 0.560 0.113 4.967 781.884 0.000
domainigend -0.886 0.118 -7.516 748.728 0.000
domainlifesat 0.027 0.106 0.254 851.015 0.799
domainnegaffect 0.148 0.112 1.324 764.170 0.186
domainpolar -0.202 0.114 -1.780 795.458 0.075
domainposaffect -1.018 0.112 -9.115 764.170 0.000
p values calculated using Satterthwaite d.f. ; Continuous predictors are mean-centered. The outcome variable remains in its original units.
Random Effects
Group Parameter Std. Dev.
team_name (Intercept) 0.461
Residual 0.612
Grouping Variables
Group # groups ICC
team_name 120 0.362
#extra analysis with US residents on the team - not included to avoid overfitting.
model.bothTournaments.COVs.incl.US<-lmer(inaccuracy_log~domain+parameters_coded+Method.complex+covidconditional+CounterFactual_Presence_Final+n_domains+team_discipline.datasci+team_discipline.SBsci+Multidisciplinary+non_US+team_size.coded+team_education+confidence+subexpert+Domain_Publications+previous_tournament.coded+(1|team_name), data=both.sets)
car::Anova(model.bothTournaments.COVs.incl.US,type="III",test.statistic="F") 
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: inaccuracy_log
##                                     F Df Df.res                Pr(>F)    
## (Intercept)                    8.7303  1 199.14              0.003507 ** 
## domain                        43.6010 11 821.55 < 0.00000000000000022 ***
## parameters_coded               3.1267  1 304.63              0.078018 .  
## Method.complex                 5.3307  1 352.89              0.021530 *  
## covidconditional               0.2855  1 760.29              0.593296    
## CounterFactual_Presence_Final  1.8824  1 810.12              0.170447    
## n_domains                      1.4368  1 141.98              0.232649    
## team_discipline.datasci        1.7733  1 181.00              0.184649    
## team_discipline.SBsci          3.1117  1 189.57              0.079341 .  
## Multidisciplinary              2.9489  1 178.71              0.087668 .  
## non_US                         0.0083  1 106.61              0.927774    
## team_size.coded                0.1214  1  91.96              0.728361    
## team_education                 0.9855  1 107.13              0.323094    
## confidence                     0.1204  1 678.84              0.728723    
## subexpert                      1.5181  1 655.67              0.218348    
## Domain_Publications            8.9493  1 785.50              0.002863 ** 
## previous_tournament.coded      3.8318  1 103.33              0.052986 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#no effect of US residency

#xtra analysis without objective expertise to examine partial Rsq 

model.bothTournaments.COVs.no.obj.expertise<-lmer(inaccuracy_log~domain+parameters_coded+Method.complex+covidconditional+CounterFactual_Presence_Final+n_domains+team_discipline.datasci+team_discipline.SBsci+Multidisciplinary+team_size.coded+team_education+confidence+subexpert+previous_tournament.coded+(1|team_name), data=both.sets)
car::Anova(model.bothTournaments.COVs.no.obj.expertise,type="III",test.statistic="F") 
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: inaccuracy_log
##                                     F Df Df.res                Pr(>F)    
## (Intercept)                    9.3180  1 199.14              0.002579 ** 
## domain                        42.7054 11 821.96 < 0.00000000000000022 ***
## parameters_coded               2.6621  1 307.26              0.103791    
## Method.complex                 5.3476  1 356.48              0.021319 *  
## covidconditional               0.2468  1 769.63              0.619484    
## CounterFactual_Presence_Final  1.5682  1 812.93              0.210829    
## n_domains                      0.8034  1 143.79              0.371572    
## team_discipline.datasci        1.8912  1 183.04              0.170746    
## team_discipline.SBsci          3.2108  1 190.45              0.074742 .  
## Multidisciplinary              2.8968  1 181.65              0.090467 .  
## team_size.coded                0.2492  1  93.38              0.618795    
## team_education                 0.7804  1 107.15              0.378998    
## confidence                     0.2701  1 684.48              0.603449    
## subexpert                      0.4634  1 707.93              0.496271    
## previous_tournament.coded      4.0966  1 103.13              0.045555 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summ(model.bothTournaments.COVs.no.obj.expertise, conf.method="boot", digits=5, center=T)
Observations 905
Dependent variable inaccuracy_log
Type Mixed effects linear regression
AIC 1967.83625
BIC 2097.65049
Pseudo-R² (fixed effects) 0.30449
Pseudo-R² (total) 0.58113
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 2.11268 0.66746 3.16524 117.97112 0.00197
domaineasian -0.19836 0.11540 -1.71893 737.53428 0.08605
domainegend -0.81884 0.11779 -6.95142 737.15943 0.00000
domainiafric 0.25235 0.11487 2.19685 727.32701 0.02835
domainiasian -0.61470 0.11323 -5.42868 745.04404 0.00000
domainideoldem 0.02671 0.11237 0.23766 762.78952 0.81221
domainideolrep 0.53601 0.11236 4.77030 762.75613 0.00000
domainigend -0.88276 0.11693 -7.54938 728.05575 0.00000
domainlifesat -0.03846 0.10807 -0.35587 826.79642 0.72203
domainnegaffect 0.12260 0.11118 1.10272 747.73876 0.27050
domainpolar -0.21957 0.11290 -1.94479 774.33699 0.05216
domainposaffect -1.04276 0.11118 -9.37899 747.73876 0.00000
parameters_coded 0.02217 0.01354 1.63687 200.11800 0.10323
Method.complex 0.13688 0.05888 2.32456 239.34959 0.02093
covidconditional -0.04025 0.08073 -0.49854 697.65096 0.61826
CounterFactual_Presence_Final 0.07821 0.06223 1.25669 764.97087 0.20925
n_domains -0.01199 0.01334 -0.89896 85.17075 0.37121
team_discipline.datasci -0.93379 0.67894 -1.37538 110.83196 0.17179
team_discipline.SBsci -1.18759 0.66272 -1.79199 115.80533 0.07575
Multidisciplinary -1.16079 0.68195 -1.70217 109.90543 0.09155
team_size.coded -0.02981 0.05967 -0.49955 53.81236 0.61943
team_education -0.00120 0.00135 -0.88424 62.20751 0.37997
confidence 0.01430 0.02739 0.52199 578.70906 0.60188
subexpert -0.01554 0.02273 -0.68361 609.84069 0.49448
previous_tournament.coded -0.35921 0.17732 -2.02577 59.74427 0.04726
p values calculated using Satterthwaite d.f. ; Continuous predictors are mean-centered. The outcome variable remains in its original units.
Random Effects
Group Parameter Std. Dev.
team_name (Intercept) 0.49147
Residual 0.60475
Grouping Variables
Group # groups ICC
team_name 120 0.39775
#Rsq - 0.30449
anova(model.bothTournaments.COVs,model.bothTournaments.COVs.no.obj.expertise)
## refitting model(s) with ML (instead of REML)
## Data: both.sets
## Models:
## model.bothTournaments.COVs.no.obj.expertise: inaccuracy_log ~ domain + parameters_coded + Method.complex + covidconditional + CounterFactual_Presence_Final + n_domains + team_discipline.datasci + team_discipline.SBsci + Multidisciplinary + team_size.coded + team_education + confidence + subexpert + previous_tournament.coded + (1 | team_name)
## model.bothTournaments.COVs: inaccuracy_log ~ domain + parameters_coded + Method.complex + covidconditional + CounterFactual_Presence_Final + n_domains + team_discipline.datasci + team_discipline.SBsci + Multidisciplinary + team_size.coded + team_education + confidence + subexpert + Domain_Publications + previous_tournament.coded + (1 | team_name)
##                                             npar    AIC    BIC  logLik
## model.bothTournaments.COVs.no.obj.expertise   27 1872.6 2002.4 -909.29
## model.bothTournaments.COVs                    28 1864.9 1999.6 -904.47
##                                             -2*log(L)  Chisq Df Pr(>Chisq)   
## model.bothTournaments.COVs.no.obj.expertise    1818.6                        
## model.bothTournaments.COVs                     1808.9 9.6359  1   0.001908 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#flip accuracy and inaccuracy

both.sets$accuracy_log<-both.sets$inaccuracy_log*(-1)
model.bothTournaments.accuracy.COVs<-lmer(accuracy_log~domain+parameters_coded+Method.complex+covidconditional+CounterFactual_Presence_Final+n_domains+team_discipline.datasci+team_discipline.SBsci+Multidisciplinary+team_size.coded+team_education+confidence+subexpert+Domain_Publications+previous_tournament.coded+(1|team_name), data=both.sets)
car::Anova(model.bothTournaments.accuracy.COVs,type="III",test.statistic="F") 
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: accuracy_log
##                                     F Df Df.res                Pr(>F)    
## (Intercept)                    8.8232  1 203.01              0.003333 ** 
## domain                        43.6113 11 822.54 < 0.00000000000000022 ***
## parameters_coded               3.0662  1 306.89              0.080934 .  
## Method.complex                 5.3771  1 345.07              0.020986 *  
## covidconditional               0.2663  1 758.90              0.605957    
## CounterFactual_Presence_Final  1.8533  1 805.70              0.173775    
## n_domains                      1.5009  1 142.21              0.222551    
## team_discipline.datasci        1.7808  1 186.34              0.183680    
## team_discipline.SBsci          3.1329  1 194.11              0.078296 .  
## Multidisciplinary              3.0012  1 184.80              0.084871 .  
## team_size.coded                0.1168  1  92.65              0.733253    
## team_education                 1.0394  1 106.45              0.310268    
## confidence                     0.1298  1 674.27              0.718734    
## subexpert                      1.4985  1 651.58              0.221352    
## Domain_Publications            9.0179  1 783.36              0.002759 ** 
## previous_tournament.coded      4.0534  1 102.31              0.046709 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summ(model.bothTournaments.accuracy.COVs, conf.method="boot", digits=3, center=T)
Observations 905
Dependent variable accuracy_log
Type Mixed effects linear regression
AIC 1963.925
BIC 2098.547
Pseudo-R² (fixed effects) 0.314
Pseudo-R² (total) 0.578
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) -2.148 0.656 -3.273 114.940 0.001
domaineasian 0.210 0.115 1.822 730.096 0.069
domainegend 0.827 0.118 7.038 730.892 0.000
domainiafric -0.251 0.115 -2.190 720.699 0.029
domainiasian 0.627 0.113 5.550 739.760 0.000
domainideoldem -0.050 0.112 -0.449 761.181 0.654
domainideolrep -0.560 0.112 -4.983 761.149 0.000
domainigend 0.889 0.117 7.618 721.318 0.000
domainlifesat 0.039 0.108 0.359 825.467 0.719
domainnegaffect -0.130 0.111 -1.168 742.386 0.243
domainpolar 0.205 0.113 1.816 771.484 0.070
domainposaffect 1.036 0.111 9.337 742.386 0.000
parameters_coded -0.023 0.013 -1.757 191.456 0.081
Method.complex -0.136 0.058 -2.331 220.817 0.021
covidconditional 0.042 0.080 0.518 673.635 0.605
CounterFactual_Presence_Final -0.085 0.062 -1.366 747.928 0.172
n_domains 0.016 0.013 1.229 80.053 0.223
team_discipline.datasci 0.891 0.667 1.335 107.715 0.185
team_discipline.SBsci 1.153 0.652 1.770 112.737 0.079
Multidisciplinary 1.161 0.670 1.733 106.722 0.086
team_size.coded 0.020 0.058 0.342 50.666 0.734
team_education 0.001 0.001 1.021 58.675 0.312
confidence -0.010 0.027 -0.362 554.613 0.718
subexpert -0.035 0.028 -1.230 525.607 0.219
Domain_Publications 0.256 0.085 3.014 711.642 0.003
previous_tournament.coded 0.349 0.173 2.015 56.259 0.049
p values calculated using Satterthwaite d.f. ; Continuous predictors are mean-centered. The outcome variable remains in its original units.
Random Effects
Group Parameter Std. Dev.
team_name (Intercept) 0.477
Residual 0.603
Grouping Variables
Group # groups ICC
team_name 120 0.385
plot.COV<-plot_summs(model.bothTournaments.accuracy.COVs, scale = TRUE, robust = "HC1",n.sd = 2, inner_ci_level = .9,
                     coefs = c("Statistical Model Complexity" = "Method.complex","N Model Parameters" = "parameters_coded", 
                     "Considered COVID-19" = "covidconditional",
                     "Considered Counterfactuals"="CounterFactual_Presence_Final",
                     "Number of Predicted Domains"="n_domains",
                     "Data Scientists on the Team"="team_discipline.datasci",
                     "Behav./Soc. Scientists on the Team"="team_discipline.SBsci",
                     "Multidisciplinary"="Multidisciplinary",
                     "Team Size"="team_size.coded",
                     "% without PhD on the Team"="team_education",
                     "Confidence in Forecast"="confidence",
                     "Confidence in Expertise"="subexpert",
                     "Team Members Topic Publications"="Domain_Publications",
                      "Prev. Exp. with\nForecasting Tournaments"="previous_tournament.coded"))
plot.COV$data<-plot.COV$data %>% arrange(estimate) %>%    # First sort by val. This sort the dataframe but NOT the factor levels
  mutate(term=factor(term, levels=term))

### Figure 5 in the manuscript
  plot.COV+theme_pubclean()+labs(y="",x="Contribution to Accuracy",caption = "most negative <===========================================> most positive")

export_summs(model.bothTournaments.accuracy.COVs, scale = TRUE, robust = "HC1",n.sd = 2, to.file = "docx", file.name="indiv.differences.standartized.docx")
Model 1
(Intercept)-2.15 ** 
(0.66)   
domaineasian0.21    
(0.12)   
domainegend0.83 ***
(0.12)   
domainiafric-0.25 *  
(0.11)   
domainiasian0.63 ***
(0.11)   
domainideoldem-0.05    
(0.11)   
domainideolrep-0.56 ***
(0.11)   
domainigend0.89 ***
(0.12)   
domainlifesat0.04    
(0.11)   
domainnegaffect-0.13    
(0.11)   
domainpolar0.20    
(0.11)   
domainposaffect1.04 ***
(0.11)   
parameters_coded-0.13    
(0.08)   
Method.complex-0.20 *  
(0.09)   
covidconditional0.04    
(0.08)   
CounterFactual_Presence_Final-0.08    
(0.06)   
n_domains0.13    
(0.11)   
team_discipline.datasci0.89    
(0.67)   
team_discipline.SBsci1.15    
(0.65)   
Multidisciplinary1.16    
(0.67)   
team_size.coded0.05    
(0.13)   
team_education0.11    
(0.11)   
confidence-0.03    
(0.08)   
subexpert-0.12    
(0.10)   
Domain_Publications0.26 ** 
(0.09)   
previous_tournament.coded0.35 *  
(0.17)   
N905       
N (team_name) 120       
AIC1940.35    
BIC2074.97    
R2 (fixed)0.31    
R2 (total)0.58    
All continuous predictors are mean-centered and scaled by 2 standard deviations. The outcome variable is in its original units. Standard errors are heteroskedasticity robust. *** p < 0.001; ** p < 0.01; * p < 0.05.
export_summs(model.bothTournaments.accuracy.COVs, scale = F, robust = "HC1", to.file = "docx", file.name="indiv.differences.unstandartized.docx")
Model 1
(Intercept)-1.96 ** 
(0.66)   
domaineasian0.21    
(0.12)   
domainegend0.83 ***
(0.12)   
domainiafric-0.25 *  
(0.11)   
domainiasian0.63 ***
(0.11)   
domainideoldem-0.05    
(0.11)   
domainideolrep-0.56 ***
(0.11)   
domainigend0.89 ***
(0.12)   
domainlifesat0.04    
(0.11)   
domainnegaffect-0.13    
(0.11)   
domainpolar0.20    
(0.11)   
domainposaffect1.04 ***
(0.11)   
parameters_coded-0.02    
(0.01)   
Method.complex-0.14 *  
(0.06)   
covidconditional0.04    
(0.08)   
CounterFactual_Presence_Final-0.08    
(0.06)   
n_domains0.02    
(0.01)   
team_discipline.datasci0.89    
(0.67)   
team_discipline.SBsci1.15    
(0.65)   
Multidisciplinary1.16    
(0.67)   
team_size.coded0.02    
(0.06)   
team_education0.00    
(0.00)   
confidence-0.01    
(0.03)   
subexpert-0.03    
(0.03)   
Domain_Publications0.26 ** 
(0.09)   
previous_tournament.coded0.35 *  
(0.17)   
N905       
N (team_name) 120       
AIC1963.93    
BIC2098.55    
R2 (fixed)0.31    
R2 (total)0.58    
Standard errors are heteroskedasticity robust. *** p < 0.001; ** p < 0.01; * p < 0.05.
partr2.COV<-partR2(model.bothTournaments.accuracy.COVs,  data=both.sets,
              R2_type = "marginal", nboot = 10, CI = 0.95)
summary(partr2.COV) #obtain effect sizes scores for unique predictors (incremental R2)
## 
## 
## R2 (marginal) and 95% CI for the full model: 
##  R2     CI_lower CI_upper ndf
##  0.3144 0.2928   0.3531   26 
## 
## ----------
## 
## Part (semi-partial) R2:
## [1] "No partitions selected."
## 
## ----------
## 
## Inclusive R2 (SC^2 * R2):
##  Predictor                     IR2    CI_lower CI_upper
##  domaineasian                  0.0000 0.0000   0.0019  
##  domainegend                   0.0326 0.0233   0.0479  
##  domainiafric                  0.0143 0.0078   0.0221  
##  domainiasian                  0.0205 0.0059   0.0326  
##  domainideoldem                0.0081 0.0021   0.0145  
##  domainideolrep                0.0686 0.0399   0.0778  
##  domainigend                   0.0432 0.0321   0.0544  
##  domainlifesat                 0.0124 0.0042   0.0255  
##  domainnegaffect               0.0137 0.0063   0.0161  
##  domainpolar                   0.0001 0.0000   0.0046  
##  domainposaffect               0.0734 0.0493   0.0873  
##  parameters_coded              0.0077 0.0001   0.0250  
##  Method.complex                0.0104 0.0019   0.0227  
##  covidconditional              0.0014 0.0000   0.0096  
##  CounterFactual_Presence_Final 0.0067 0.0002   0.0171  
##  n_domains                     0.0093 0.0028   0.0353  
##  team_discipline.datasci       0.0026 0.0001   0.0373  
##  team_discipline.SBsci         0.0016 0.0000   0.0461  
##  Multidisciplinary             0.0002 0.0000   0.0123  
##  team_size.coded               0.0064 0.0005   0.0137  
##  team_education                0.0006 0.0000   0.0076  
##  confidence                    0.0005 0.0001   0.0025  
##  subexpert                     0.0031 0.0010   0.0121  
##  Domain_Publications           0.0005 0.0000   0.0099  
##  previous_tournament.coded     0.0118 0.0010   0.0308  
## 
## ----------
## 
## Structure coefficients r(Yhat,x):
##  Predictor                     SC      CI_lower CI_upper
##  domaineasian                   0.0091 -0.0477   0.0726 
##  domainegend                    0.3220  0.2584   0.3750 
##  domainiafric                  -0.2130 -0.2594  -0.1620 
##  domainiasian                   0.2556  0.1304   0.3070 
##  domainideoldem                -0.1609 -0.2064  -0.0713 
##  domainideolrep                -0.4673 -0.4759  -0.3683 
##  domainigend                    0.3705  0.3135   0.4125 
##  domainlifesat                 -0.1988 -0.2846  -0.1111 
##  domainnegaffect               -0.2090 -0.2230  -0.1456 
##  domainpolar                    0.0145 -0.0159   0.1104 
##  domainposaffect                0.4831  0.3922   0.5161 
##  parameters_coded              -0.1561 -0.2849  -0.0063 
##  Method.complex                -0.1823 -0.2780  -0.0776 
##  covidconditional              -0.0668 -0.1764   0.0631 
##  CounterFactual_Presence_Final -0.1462 -0.2389   0.0060 
##  n_domains                      0.1720 -0.0313   0.3186 
##  team_discipline.datasci       -0.0912 -0.3206   0.1007 
##  team_discipline.SBsci          0.0721 -0.1077   0.3532 
##  Multidisciplinary              0.0224 -0.1054   0.1926 
##  team_size.coded                0.1430  0.0176   0.2103 
##  team_education                 0.0447 -0.0493   0.1482 
##  confidence                     0.0396 -0.0506   0.0830 
##  subexpert                     -0.0994 -0.1875   0.0692 
##  Domain_Publications           -0.0395 -0.1576   0.1222 
##  previous_tournament.coded      0.1934  0.0409   0.3068 
## 
## ----------
## 
## Beta weights (standardised estimates)
##  Predictor                     BW      CI_lower CI_upper
##  domaineasian                   0.2099  0.0325   0.3476 
##  domainegend                    0.8273  0.7124   0.9992 
##  domainiafric                  -0.2510 -0.4226  -0.1512 
##  domainiasian                   0.6273  0.3840   0.7613 
##  domainideoldem                -0.0504 -0.2352   0.2094 
##  domainideolrep                -0.5598 -0.7398  -0.3151 
##  domainigend                    0.8889  0.6772   1.0576 
##  domainlifesat                  0.0387 -0.0698   0.2421 
##  domainnegaffect               -0.1295 -0.2644   0.0641 
##  domainpolar                    0.2046  0.0967   0.3563 
##  domainposaffect                1.0358  0.8320   1.1543 
##  parameters_coded              -0.0782 -0.1265  -0.0049 
##  Method.complex                -0.1196 -0.1557  -0.0682 
##  covidconditional               0.0415 -0.1139   0.1814 
##  CounterFactual_Presence_Final -0.0845 -0.1745   0.0088 
##  n_domains                      0.0775  0.0133   0.1526 
##  team_discipline.datasci        0.8906  0.2398   2.3511 
##  team_discipline.SBsci          1.1533  0.3746   2.7274 
##  Multidisciplinary              1.1610  0.2293   2.7271 
##  team_size.coded                0.0270 -0.0807   0.1124 
##  team_education                 0.0642 -0.0037   0.1220 
##  confidence                    -0.0168 -0.0746   0.0200 
##  subexpert                     -0.0724 -0.1469  -0.0217 
##  Domain_Publications            0.2563  0.1847   0.3814 
##  previous_tournament.coded      0.3493  0.2053   0.5480 
## 
## ----------
## 
## Parametric bootstrapping resulted in warnings or messages:
## Check r2obj$boot_warnings and r2obj$boot_messages.
partR2(model.bothTournaments.accuracy.COVs, partvars = 
         c("Domain_Publications","previous_tournament.coded","Method.complex","Multidisciplinary"))
## 
## 
## R2 (marginal) and 95% CI for the full model: 
##  R2     CI_lower CI_upper nboot ndf
##  0.3144 NA       NA       1     26 
## 
## ----------
## 
## Part (semi-partial) R2:
##  Predictor(s)                                                                  
##  Model                                                                         
##  Domain_Publications                                                           
##  previous_tournament.coded                                                     
##  Method.complex                                                                
##  Multidisciplinary                                                             
##  Domain_Publications+previous_tournament.coded                                 
##  Domain_Publications+Method.complex                                            
##  Domain_Publications+Multidisciplinary                                         
##  previous_tournament.coded+Method.complex                                      
##  previous_tournament.coded+Multidisciplinary                                   
##  Method.complex+Multidisciplinary                                              
##  Domain_Publications+previous_tournament.coded+Method.complex                  
##  Domain_Publications+previous_tournament.coded+Multidisciplinary               
##  Domain_Publications+Method.complex+Multidisciplinary                          
##  previous_tournament.coded+Method.complex+Multidisciplinary                    
##  Domain_Publications+previous_tournament.coded+Method.complex+Multidisciplinary
##  R2     CI_lower CI_upper nboot ndf
##  0.3144 NA       NA       1     26 
##  0.0065 NA       NA       1     25 
##  0.0101 NA       NA       1     25 
##  0.0124 NA       NA       1     25 
##  0.0006 NA       NA       1     25 
##  0.0179 NA       NA       1     24 
##  0.0178 NA       NA       1     24 
##  0.0070 NA       NA       1     24 
##  0.0223 NA       NA       1     24 
##  0.0121 NA       NA       1     24 
##  0.0130 NA       NA       1     24 
##  0.0289 NA       NA       1     23 
##  0.0198 NA       NA       1     23 
##  0.0183 NA       NA       1     23 
##  0.0240 NA       NA       1     23 
##  0.0307 NA       NA       1     22
##supplementary  - examine COVID score inaccuracy as predictor - does inaccuracy in predictions depend on COVID-inaccuracy?
#IMPORTANT: only done in Phase 1
phase1_exp$inaccuracy_log<-log(phase1_exp$MASE1_w1)
model.t1.COVID<-lmer(inaccuracy_log~domain+log(MASE1_covid)+(1|team_name), data=phase1_exp)
car::Anova(model.t1.COVID,type="III",test.statistic="F") 
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: inaccuracy_log
##                       F Df Df.res  Pr(>F)  
## (Intercept)      5.9844  1 16.911 0.02567 *
## domain           1.5990 11 10.366 0.23029  
## log(MASE1_covid) 2.0214  1 15.726 0.17462  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summ(model.t1.COVID, conf.method="boot", digits=3, center=T) #no significant effect of COVID prediction in accuracy on MASE in accuracy
Observations 31
Dependent variable inaccuracy_log
Type Mixed effects linear regression
AIC 85.027
BIC 106.536
Pseudo-R² (fixed effects) 0.336
Pseudo-R² (total) 0.659
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 1.976 0.746 2.649 11.455 0.022
domaineasian 0.202 0.823 0.246 8.844 0.812
domainegend -0.696 0.941 -0.740 7.278 0.483
domainiafric 0.182 0.941 0.194 7.278 0.852
domainiasian -0.166 0.790 -0.210 8.233 0.839
domainideoldem -0.238 0.941 -0.253 7.278 0.807
domainideolrep 0.121 0.941 0.129 7.278 0.901
domainigend -0.134 0.941 -0.142 7.278 0.891
domainlifesat 0.097 0.764 0.127 8.665 0.902
domainnegaffect -0.552 0.787 -0.702 7.882 0.503
domainpolar -1.063 0.812 -1.309 8.402 0.225
domainposaffect -1.475 0.787 -1.875 7.882 0.098
log(MASE1_covid) -0.587 0.366 -1.606 14.747 0.130
p values calculated using Satterthwaite d.f. ; Continuous predictors are mean-centered. The outcome variable remains in its original units.
Random Effects
Group Parameter Std. Dev.
team_name (Intercept) 0.647
Residual 0.665
Grouping Variables
Group # groups ICC
team_name 11 0.486
#supplementary - test interaction between model complexity and phase

model.bothTournaments.accuracy.COVs.phase<-lmer(accuracy_log~phase+domain+phase*parameters_coded+phase*Method.complex+covidconditional+CounterFactual_Presence_Final+n_domains+team_discipline.datasci+team_discipline.SBsci+Multidisciplinary+team_size.coded+team_education+confidence+subexpert+Domain_Publications+previous_tournament.coded+(1|team_name), data=both.sets)
car::Anova(model.bothTournaments.accuracy.COVs.phase,type="III",test.statistic="F") #no significant interaction between phase and complexity (to test the question of lower number of datapoint favors simpler forecasts)
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: accuracy_log
##                                     F Df Df.res                Pr(>F)    
## (Intercept)                   12.8988  1 224.55             0.0004039 ***
## phase                         17.8002  1 846.76            0.00002718 ***
## domain                        47.1308 11 824.32 < 0.00000000000000022 ***
## parameters_coded               0.8307  1 448.45             0.3625540    
## Method.complex                 0.7836  1 509.48             0.3764650    
## covidconditional               0.0454  1 717.53             0.8313278    
## CounterFactual_Presence_Final  0.0311  1 749.97             0.8601715    
## n_domains                      3.2718  1 135.73             0.0726936 .  
## team_discipline.datasci        1.5091  1 201.36             0.2207050    
## team_discipline.SBsci          2.8524  1 211.02             0.0927143 .  
## Multidisciplinary              2.3977  1 199.70             0.1230936    
## team_size.coded                0.0044  1  89.78             0.9472088    
## team_education                 0.8645  1 104.52             0.3546123    
## confidence                     0.7297  1 616.26             0.3933204    
## subexpert                      1.0951  1 593.32             0.2957712    
## Domain_Publications            8.5207  1 747.78             0.0036168 ** 
## previous_tournament.coded      5.5332  1  99.89             0.0206203 *  
## phase:parameters_coded         0.2398  1 818.83             0.6244804    
## phase:Method.complex           0.2062  1 851.90             0.6498957    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#supplementary curiosity check - does % gender qualify confidence

model.confidence.gender<-lmer(confidence~phase+domain+team_gender+(1|team_name), data=both.sets)
car::Anova(model.confidence.gender,type="III",test.statistic="F") #no significant interaction between phase and complexity (to test the question of lower number of datapoint favors simpler forecasts)
## Analysis of Deviance Table (Type III Wald F tests with Kenward-Roger df)
## 
## Response: confidence
##                    F Df Df.res                Pr(>F)    
## (Intercept) 309.1073  1 279.47 < 0.00000000000000022 ***
## phase         4.4346  1 790.22               0.03553 *  
## domain        5.0234 11 793.43          0.0000001328 ***
## team_gender   3.6484  1 121.23               0.05849 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summ(model.confidence.gender)
Observations 905
Dependent variable confidence
Type Mixed effects linear regression
AIC 2352.24
BIC 2429.17
Pseudo-R² (fixed effects) 0.05
Pseudo-R² (total) 0.76
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 3.03 0.17 17.58 277.94 0.00
phasesecond 0.11 0.05 2.11 789.50 0.04
domaineasian -0.08 0.14 -0.61 780.86 0.54
domainegend -0.02 0.14 -0.13 781.56 0.90
domainiafric -0.10 0.13 -0.73 776.82 0.46
domainiasian 0.17 0.13 1.27 782.90 0.21
domainideoldem -0.01 0.13 -0.05 784.41 0.96
domainideolrep -0.01 0.13 -0.05 784.41 0.96
domainigend 0.12 0.14 0.88 778.49 0.38
domainlifesat 0.54 0.13 4.24 812.87 0.00
domainnegaffect 0.34 0.13 2.59 781.65 0.01
domainpolar 0.06 0.13 0.44 790.97 0.66
domainposaffect 0.34 0.13 2.59 781.65 0.01
team_gender 0.01 0.00 1.91 120.40 0.06
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
team_name (Intercept) 1.24
Residual 0.71
Grouping Variables
Group # groups ICC
team_name 120 0.75

Role of Updating - Phase 2

#proportions of scientists who updated their forecasts
proportions(table(phase1_exp$revised,phase1_exp$domain),margin=2)
##    
##        eafric    easian     egend    iafric    iasian  ideoldem  ideolrep
##   0 0.4090909 0.4230769 0.4761905 0.4000000 0.4000000 0.5882353 0.5882353
##   1 0.5909091 0.5769231 0.5238095 0.6000000 0.6000000 0.4117647 0.4117647
##    
##         igend   lifesat negaffect     polar posaffect
##   0 0.4090909 0.6315789 0.5000000 0.4375000 0.5000000
##   1 0.5909091 0.3684211 0.5000000 0.5625000 0.5000000
#set up the file for analyses
pd <- position_dodge(0.7) # move them .07 to the left and right
dat_phase2$Method.code <- relevel(factor(dat_phase2$Method.code), "Intuition/Theory") #use lay people as a reference group
dat_phase2$method.contrast<-ifelse(dat_phase2$Method.code=='Intuition/Theory',0,1)
dat_phase2$compare_to_naive_rwf_MASE2.update<-ifelse(dat_phase2$compare_to_naive_rwf_MASE_w2!="Equal to Naive rwf",dat_phase2$compare_to_naive_rwf_MASE_w2,ifelse(dat_phase2$compare_to_naive_rwf_MASE_w2=="Equal to Naive rwf","Below Naive rwf",NA))
dat_phase2$compare_to_naive_linear_MASE2.update<-ifelse(dat_phase2$compare_to_naive_linear_MASE_w2!="Equal to Naive linear",dat_phase2$compare_to_naive_linear_MASE_w2,ifelse(dat_phase2$compare_to_naive_linear_MASE_w2=="Equal to Naive linear","Below Naive linear",NA))
dat_phase2$Group[dat_phase2$TournamentStart=="May"&dat_phase2$revised == 0]<-"Original May"
## Warning: Unknown or uninitialised column: `Group`.
dat_phase2$Group[dat_phase2$TournamentStart=="November"&dat_phase2$revised == 0]<-"Original November"
dat_phase2$Group[dat_phase2$TournamentStart=="May"&dat_phase2$revised == 1]<-"Updated May"
dat_phase2$teamS<-as.factor(ifelse(dat_phase2$team_size.coded>=6,3,ifelse(dat_phase2$team_size.coded<6 & dat_phase2$team_size.coded>1,2,ifelse(dat_phase2$team_size.coded==1,1,NA))))
dat_phase2$is_multidisciplinary<-ifelse(dat_phase2$discipline=="Multi-disciplinary",1,0)
dat_phase2$objectivexpert<-ifelse(dat_phase2$pub==1,"Expert",ifelse(dat_phase2$pub==2,"Non Expert",NA))
dat_phase2$covidconditional<-ifelse(dat_phase2$covidcondyn==0,"No",ifelse(dat_phase2$covidcondyn==1,"Yes",NA))

#add historical variability data (as extra variable)
dat_phase2<-dat_phase2 %>% left_join(complexity)
## Joining with `by = join_by(domain)`
#count how many domains per person
dat_phase2<-dat_phase2 %>%group_by(team_name) %>% 
 mutate(n_domains = n())

## EXAMINE EFFECTS OF new teams at phase 2 vs. OG teams who updated their forecasts: Just ACADEMICS
##revised  - Indicates whether or not the team has a matching submission in both phase 1 & 2 for the same domain

#MAIN TEXT ANALYSES####
model.phase2.update<-  lmer(log(MASE1_w2)~Group+(1|ResponseId), data=dat_phase2)
car::Anova(model.phase2.update,type="III") #sig difference between original May and original Nov, but not between updated May and original Nov
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: log(MASE1_w2)
##               Chisq Df            Pr(>Chisq)    
## (Intercept) 185.286  1 < 0.00000000000000022 ***
## Group        51.494  2      0.00000000000658 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summ(model.phase2.update)
Observations 546
Dependent variable log(MASE1_w2)
Type Mixed effects linear regression
AIC 1337.24
BIC 1358.75
Pseudo-R² (fixed effects) 0.12
Pseudo-R² (total) 0.18
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 0.98 0.07 13.61 67.27 0.00
GroupOriginal November -0.64 0.10 -6.34 54.52 0.00
GroupUpdated May -0.63 0.10 -6.07 68.65 0.00
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
ResponseId (Intercept) 0.22
Residual 0.79
Grouping Variables
Group # groups ICC
ResponseId 120 0.07
emmeans(model.phase2.update,pairwise ~Group, adjust = "none") #nonsig
## $emmeans
##  Group             emmean     SE   df lower.CL upper.CL
##  Original May       0.984 0.0728 92.7    0.839    1.129
##  Original November  0.343 0.0711 62.6    0.201    0.485
##  Updated May        0.353 0.0752 96.2    0.203    0.502
## 
## Degrees-of-freedom method: kenward-roger 
## Results are given on the log (not the response) scale. 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast                         estimate    SE   df t.ratio p.value
##  Original May - Original November  0.64089 0.102 75.9   6.301  <.0001
##  Original May - Updated May        0.63146 0.105 94.5   6.036  <.0001
##  Original November - Updated May  -0.00943 0.103 77.8  -0.091  0.9276
## 
## Degrees-of-freedom method: kenward-roger 
## Results are given on the log (not the response) scale.
#contrast difference of updating forecasts for explicit asian bias, life satisfaction, neg affect, polarization, pos affect
eff_size(emmeans(model.phase2.update,pairwise ~Group, adjust = "none"),sigma = sigma(model.phase2.update), edf =df.residual(model.phase2.update) ) 
## Since 'object' is a list, we are using the contrasts already present.
##  contrast                           effect.size    SE   df lower.CL upper.CL
##  (Original May - Original November)       0.816 0.132 75.9    0.553     1.08
##  (Original May - Updated May)             0.804 0.135 94.5    0.535     1.07
##  (Original November - Updated May)       -0.012 0.132 77.8   -0.274     0.25
## 
## sigma used for effect sizes: 0.7856 
## Degrees-of-freedom method: inherited from kenward-roger when re-gridding 
## Confidence level used: 0.95
#######################

###by type of justification (supplementary)
##first, create the variable
#Just new data as a reason for update
dat_phase2$Group.data[dat_phase2$Group!="Updated May"]<-dat_phase2$Group
## Warning: Unknown or uninitialised column: `Group.data`.
## Warning in dat_phase2$Group.data[dat_phase2$Group != "Updated May"] <-
## dat_phase2$Group: number of items to replace is not a multiple of replacement
## length
dat_phase2$Group.data[dat_phase2$Group=="Updated May"&dat_phase2$justification_dataReceived==1]<-"Data"
dat_phase2$Group.data[dat_phase2$Group=="Updated May"&dat_phase2$justification_theoreticalInsight==1]<-"Theory"
dat_phase2$Group.data[dat_phase2$Group=="Updated May"&dat_phase2$justification_externalEvent==1]<-"Extra"

model.phase2.update.data<-  lmer(log(MASE1_w2)~Group.data+(1|ResponseId), data=dat_phase2)
car::Anova(model.phase2.update.data,type="III") # there is a significant difference in prediction error as a function of types of justifications.
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: log(MASE1_w2)
##               Chisq Df    Pr(>Chisq)    
## (Intercept)  8.3713  1      0.003812 ** 
## Group.data  45.1323  5 0.00000001364 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(model.phase2.update.data,pairwise ~Group.data, adjust = "none") 
## $emmeans
##  Group.data        emmean     SE    df lower.CL upper.CL
##  Data               0.407 0.1420  81.8    0.125    0.689
##  Extra              0.408 0.1320 183.0    0.147    0.669
##  Original May       0.983 0.0732  77.4    0.837    1.129
##  Original November  0.316 0.0947 112.4    0.129    0.504
##  Theory             0.855 0.8200 468.1   -0.757    2.466
##  Updated May        0.371 0.0963  73.0    0.179    0.563
## 
## Degrees-of-freedom method: kenward-roger 
## Results are given on the log (not the response) scale. 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast                         estimate    SE    df t.ratio p.value
##  Data - Extra                     -0.00145 0.191 150.5  -0.008  0.9939
##  Data - Original May              -0.57624 0.160  80.9  -3.612  0.0005
##  Data - Original November          0.09053 0.171  89.9   0.531  0.5968
##  Data - Theory                    -0.44783 0.832 463.3  -0.538  0.5908
##  Data - Updated May                0.03575 0.171  78.9   0.209  0.8353
##  Extra - Original May             -0.57479 0.151 146.7  -3.803  0.0002
##  Extra - Original November         0.09198 0.163 154.1   0.565  0.5726
##  Extra - Theory                   -0.44638 0.828 469.4  -0.539  0.5901
##  Extra - Updated May               0.03720 0.164 129.3   0.227  0.8205
##  Original May - Original November  0.66678 0.120  97.2   5.571  <.0001
##  Original May - Theory             0.12842 0.823 466.9   0.156  0.8761
##  Original May - Updated May        0.61199 0.121  74.6   5.060  <.0001
##  Original November - Theory       -0.53836 0.826 466.5  -0.652  0.5147
##  Original November - Updated May  -0.05479 0.127 218.6  -0.431  0.6668
##  Theory - Updated May              0.48358 0.826 465.8   0.586  0.5584
## 
## Degrees-of-freedom method: kenward-roger 
## Results are given on the log (not the response) scale.
dat_phase2$MASE1_w2_log<-log(dat_phase2$MASE1_w2)
model.phase2.update.type<-  lmer(MASE1_w2_log~justification_dataReceived+justification_theoreticalInsight+justification_externalEvent+(1|team_name), data=dat_phase2)
car::Anova(model.phase2.update.type,type="III")  
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: MASE1_w2_log
##                                   Chisq Df Pr(>Chisq)   
## (Intercept)                      9.4126  1   0.002155 **
## justification_dataReceived       0.6423  1   0.422890   
## justification_theoreticalInsight 1.3954  1   0.237501   
## justification_externalEvent      0.2338  1   0.628691   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summ(model.phase2.update.type,scale = F, robust = "HC1", digits=3, n.sd = 2)
Observations 162
Dependent variable MASE1_w2_log
Type Mixed effects linear regression
AIC 412.530
BIC 431.055
Pseudo-R² (fixed effects) 0.021
Pseudo-R² (total) 0.173
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 0.359 0.117 3.068 48.213 0.004
justification_dataReceived -0.135 0.169 -0.801 61.505 0.426
justification_theoreticalInsight 0.782 0.662 1.181 78.712 0.241
justification_externalEvent 0.081 0.167 0.484 98.156 0.630
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
team_name (Intercept) 0.334
Residual 0.779
Grouping Variables
Group # groups ICC
team_name 38 0.155
#no sig differences
#export_summs(model.phase2.update.type, scale = T, robust = "HC1", n.sd = 2, to.file = "docx")
partR2(model.phase2.update.type, partvars = 
         c("justification_dataReceived","justification_theoreticalInsight","justification_externalEvent"))
## 
## 
## R2 (marginal) and 95% CI for the full model: 
##  R2     CI_lower CI_upper nboot ndf
##  0.0215 NA       NA       1     4  
## 
## ----------
## 
## Part (semi-partial) R2:
##  Predictor(s)                                                                           
##  Model                                                                                  
##  justification_dataReceived                                                             
##  justification_theoreticalInsight                                                       
##  justification_externalEvent                                                            
##  justification_dataReceived+justification_theoreticalInsight                            
##  justification_dataReceived+justification_externalEvent                                 
##  justification_theoreticalInsight+justification_externalEvent                           
##  justification_dataReceived+justification_theoreticalInsight+justification_externalEvent
##  R2     CI_lower CI_upper nboot ndf
##  0.0215 NA       NA       1     4  
##  0.0032 NA       NA       1     3  
##  0.0094 NA       NA       1     3  
##  0.0021 NA       NA       1     3  
##  0.0176 NA       NA       1     2  
##  0.0058 NA       NA       1     2  
##  0.0135 NA       NA       1     2  
##  0.0215 NA       NA       1     1

demographics experts

phase1.exp.dem<-phase1_exp %>% dplyr::select(team_name, team_size.coded, n_domains,team_Age,team_education,non_US,team_gender,revised) %>% dplyr::group_by(team_name) %>% 
summarize_all(~ mean(.x, na.rm = TRUE))

phase2.exp.dem<-dat_phase2 %>% dplyr::select(team_name, team_size.coded, n_domains,team_Age,team_education,non_US,team_gender,revised) %>% dplyr::group_by(team_name) %>% 
summarize_all(~ mean(.x, na.rm = TRUE))


#number of participants
sum(phase1.exp.dem$team_size.coded)
## [1] 135
sum(phase2.exp.dem$team_size.coded)
## [1] 190
#team size
psych::describe(phase1.exp.dem$team_size.coded)
##    vars  n mean   sd median trimmed mad min max range skew kurtosis   se
## X1    1 86 1.57 1.12      1     1.3   0   1   7     6 2.37     6.09 0.12
psych::describe(phase2.exp.dem$team_size.coded)
##    vars   n mean  sd median trimmed mad min max range skew kurtosis  se
## X1    1 120 1.58 1.1      1    1.32   0   1   7     6  2.2     5.17 0.1
#n forecasted domains
psych::describe(phase1.exp.dem$n_domains)
##    vars  n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 86 4.17 3.78      3    3.64 2.97   1  12    11 1.16    -0.07 0.41
psych::describe(phase2.exp.dem$n_domains)
##    vars   n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 120 4.55 3.88      3    4.06 2.97   1  12    11 0.91    -0.64 0.35
#% participating teams per domain (out of 86/120
phase1_exp %>% dplyr::select(team_name, domain) %>% dplyr::group_by(domain) %>% 
  summarize_all(~ n()) %>% mutate(perc = team_name/86*100)
## # A tibble: 12 × 3
##    domain    team_name  perc
##    <chr>         <int> <dbl>
##  1 eafric           22  25.6
##  2 easian           26  30.2
##  3 egend            21  24.4
##  4 iafric           25  29.1
##  5 iasian           30  34.9
##  6 ideoldem         34  39.5
##  7 ideolrep         34  39.5
##  8 igend            22  25.6
##  9 lifesat          57  66.3
## 10 negaffect        28  32.6
## 11 polar            32  37.2
## 12 posaffect        28  32.6
dat_phase2 %>% dplyr::select(team_name, domain) %>% dplyr::group_by(domain) %>% 
  summarize_all(~ n()) %>% mutate(perc = team_name/120*100)
## # A tibble: 12 × 3
##    domain    team_name  perc
##    <chr>         <int> <dbl>
##  1 eafric           32  26.7
##  2 easian           37  30.8
##  3 egend            36  30  
##  4 iafric           37  30.8
##  5 iasian           42  35  
##  6 ideoldem         47  39.2
##  7 ideolrep         47  39.2
##  8 igend            37  30.8
##  9 lifesat          82  68.3
## 10 negaffect        51  42.5
## 11 polar            47  39.2
## 12 posaffect        51  42.5
# % teams with expertise

phase1_exp %>% dplyr::select(team_name, domain,Domain_Publications) %>% dplyr::group_by(domain,Domain_Publications) %>% 
  summarize_all(~ n()) %>% dplyr::group_by(domain) %>% mutate(perc = team_name/sum(team_name)*100)%>% arrange(-Domain_Publications) %>% filter(Domain_Publications ==1)
## # A tibble: 12 × 4
## # Groups:   domain [12]
##    domain    Domain_Publications team_name  perc
##    <chr>                   <dbl>     <int> <dbl>
##  1 eafric                      1         4 18.2 
##  2 easian                      1         3 11.5 
##  3 egend                       1         3 14.3 
##  4 iafric                      1         4 16   
##  5 iasian                      1         4 13.3 
##  6 ideoldem                    1        14 41.2 
##  7 ideolrep                    1        14 41.2 
##  8 igend                       1         2  9.09
##  9 lifesat                     1        19 33.3 
## 10 negaffect                   1         7 25   
## 11 polar                       1        13 40.6 
## 12 posaffect                   1         7 25
proportions(xtabs( ~ domain+Domain_Publications,phase1_exp),"domain")*100 #by domain
##            Domain_Publications
## domain              0         1
##   eafric    81.818182 18.181818
##   easian    88.461538 11.538462
##   egend     85.714286 14.285714
##   iafric    84.000000 16.000000
##   iasian    86.666667 13.333333
##   ideoldem  58.823529 41.176471
##   ideolrep  58.823529 41.176471
##   igend     90.909091  9.090909
##   lifesat   66.666667 33.333333
##   negaffect 75.000000 25.000000
##   polar     59.375000 40.625000
##   posaffect 75.000000 25.000000
proportions(xtabs( ~ domain+Domain_Publications,dat_phase2),"domain")*100 #by domain
##            Domain_Publications
## domain             0        1
##   eafric    68.75000 31.25000
##   easian    83.78378 16.21622
##   egend     86.11111 13.88889
##   iafric    72.97297 27.02703
##   iasian    80.95238 19.04762
##   ideoldem  55.31915 44.68085
##   ideolrep  55.31915 44.68085
##   igend     89.18919 10.81081
##   lifesat   63.41463 36.58537
##   negaffect 66.66667 33.33333
##   polar     57.44681 42.55319
##   posaffect 66.66667 33.33333
#age
psych::describe(phase1.exp.dem$team_Age)
##    vars  n  mean   sd median trimmed  mad  min max range skew kurtosis   se
## X1    1 73 38.18 8.37     37   37.86 7.91 22.5  63  40.5 0.45    -0.01 0.98
psych::describe(phase2.exp.dem$team_Age)
##    vars   n  mean  sd median trimmed  mad  min max range skew kurtosis   se
## X1    1 106 36.82 8.3     36   36.35 7.41 22.5  63  40.5  0.6     0.29 0.81
#Percentage of team members who indicated their level of education was not PhD
psych::describe(phase1.exp.dem$team_education)
##    vars  n  mean    sd median trimmed mad min max range skew kurtosis   se
## X1    1 86 21.79 39.58      0   15.34   0   0 100   100 1.31    -0.16 4.27
psych::describe(phase2.exp.dem$team_education)
##    vars   n  mean    sd median trimmed mad min max range skew kurtosis   se
## X1    1 120 25.59 41.88      0   19.48   0   0 100   100 1.07    -0.75 3.82
#to get number of team members with a PhD

##multiple % per team without a PhD by number of team members and subtract from 1 and multiple by 100 to get % of forecasts done by PhDs
(1 - sum(phase1.exp.dem$team_education/100*phase1.exp.dem$team_size.coded)/sum(phase1.exp.dem$team_size.coded))*100 #73.33%
## [1] 73.33333
(1 - sum(phase2.exp.dem$team_education/100*phase2.exp.dem$team_size.coded)/sum(phase2.exp.dem$team_size.coded))*100 #67.37%
## [1] 67.36842
# non-US Percentage of team members who indicated their country of residence was not the United States
psych::describe(phase1.exp.dem$non_US)
##    vars  n  mean    sd median trimmed mad min max range  skew kurtosis   se
## X1    1 86 59.69 47.61    100   61.91   0   0 100   100 -0.38    -1.81 5.13
psych::describe(phase2.exp.dem$non_US)
##    vars   n  mean    sd median trimmed mad min max range  skew kurtosis   se
## X1    1 120 59.72 47.82    100   62.16   0   0 100   100 -0.38    -1.82 4.37
(sum(phase1.exp.dem$non_US/100*phase1.exp.dem$team_size.coded)/sum(phase1.exp.dem$team_size.coded))*100 #62.22% nonus
## [1] 62.22222
(sum(phase2.exp.dem$non_US/100*phase2.exp.dem$team_size.coded)/sum(phase2.exp.dem$team_size.coded))*100 #62.10526% nonus
## [1] 62.10526
#gender: Percentage of team members who indicated their gender was either Female or Other
psych::describe(phase1.exp.dem$team_gender)
##    vars  n  mean    sd median trimmed mad min max range skew kurtosis   se
## X1    1 86 19.76 36.36      0   12.85   0   0 100   100 1.49     0.51 3.92
psych::describe(phase2.exp.dem$team_gender)
##    vars   n  mean    sd median trimmed mad min max range skew kurtosis   se
## X1    1 120 21.78 37.48      0   14.72   0   0 100   100 1.34     0.07 3.42
#male phase 1
(1 - sum(phase1.exp.dem$team_gender/100*phase1.exp.dem$team_size.coded)/sum(phase1.exp.dem$team_size.coded))*100 #76.2963% male
## [1] 76.2963
(1 - sum(phase2.exp.dem$team_gender/100*phase2.exp.dem$team_size.coded)/sum(phase2.exp.dem$team_size.coded))*100 #74.47368% male
## [1] 74.47368
#% revised predictions

psych::describe(phase1.exp.dem$revised*100)
##    vars  n  mean    sd median trimmed mad min max range skew kurtosis   se
## X1    1 86 44.19 49.95      0   42.86   0   0 100   100 0.23    -1.97 5.39
#did preference for updating vary by method?
model.revised.t1.by.method<-glmer(revised~method.contrast+(1|team_name), data=phase1_exp, family=binomial)
summ(model.revised.t1.by.method, conf.method="boot", digits=3, center=T)
Observations 359
Dependent variable revised
Type Mixed effects generalized linear model
Family binomial
Link logit
AIC 125.744
BIC 137.394
Pseudo-R² (fixed effects) 0.000
Pseudo-R² (total) 0.999
Fixed Effects
Est. S.E. z val. p
(Intercept) -11.443 1.908 -5.999 0.000
method.contrast -1.033 2.072 -0.498 0.618
; Continuous predictors are mean-centered. The outcome variable remains in its original units.
Random Effects
Group Parameter Std. Dev.
team_name (Intercept) 54.347
Grouping Variables
Group # groups ICC
team_name 86 0.999
#theory vs. intuition in each phase
table(phase1_exp$basis)
## 
##       1     1,2   1,2,4 1,2,4,5   1,2,5     1,3   1,3,4     1,4   1,4,5     1,5 
##      97      41      14       3       4       4       2      25       2       7 
##       2     2,4   2,4,5       4       5 
##       9       9       2     124      16

demographics lay people

phase1.noexp.dem<-phase1 %>% filter(isExpert.factor=="Prolific") %>% dplyr::select(ResponseId,Age, Sex, Ethnicity, Education,Residential.Area, Income) %>% dplyr::group_by(ResponseId) %>% 
summarize_all(~ mean(.x, na.rm = TRUE))
psych::describe(phase1.noexp.dem$Age)
##    vars   n  mean    sd median trimmed mad min max range skew kurtosis   se
## X1    1 741 30.39 10.56     28   28.88 8.9  18  78    60  1.3     1.72 0.39
prop.table(table(phase1.noexp.dem$Sex)) #46.36% female
## 
##           1           2           3 
## 0.529014845 0.465587045 0.005398111
prop.table(table(phase1.noexp.dem$Education))
## 
##           1           2           3           4           5           6 
## 0.004048583 0.075573549 0.251012146 0.037786775 0.422402159 0.160593792 
##           7           8 
## 0.021592443 0.026990553
#"1 =  less than highschool
#2 =  highschool grad
#3 =  some college
#4 =  vocational/technical degree
#5 =  bachelor's
#6 =  masters'
#7 =  doctorate
#8 =  professional degree"
0.416846652+ 0.162706983+ 0.017278618+ 0.029517639 #proportion with a college degree or above
## [1] 0.6263499
0.003599712+ 0.082073434 #high school or less
## [1] 0.08567315
0.245500360+ 0.042476602 #some college
## [1] 0.287977
prop.table(table(phase1.noexp.dem$Ethnicity))
## 
##           1           2           3           4           5           6 
## 0.006756757 0.181081081 0.094594595 0.585135135 0.006756757 0.075675676 
##           7           8           9 
## 0.008108108 0.035135135 0.006756757
#1-9 ranging from aboriginal/native, asian, black, white, middle eastern, hispanic, east indian, mixed race, other/not listed
prop.table(table(phase1.noexp.dem$Income))
## 
##          1          2          3          4          5          6          7 
## 0.06811989 0.07629428 0.08719346 0.12942779 0.21389646 0.16621253 0.14305177 
##          8 
## 0.11580381
#from 0-15k, 15-25k, 25-35k, 35-50k, 50-75k, 75-100k,100-150k, 150k+
prop.table(table(phase1.noexp.dem$Residential.Area))
## 
##         1         2         3 
## 0.3270270 0.5716216 0.1013514
#"1 = Urban,
#2 = Suburban
#3 = rural"